diff options
| author | Dirk Engling <erdgeist@erdgeist.org> | 2019-01-30 18:12:18 +0100 |
|---|---|---|
| committer | Dirk Engling <erdgeist@erdgeist.org> | 2019-01-30 18:12:18 +0100 |
| commit | a187241f4e4cf8a592e0a3cc0b61f949e6184a9e (patch) | |
| tree | ee6adb8733dd81698f4a50bf75aeadbd30f68464 | |
| parent | 0150806fbf0cc64e60984f8a99aa45ca734e0735 (diff) | |
Add branch name mapper code for v3
| -rw-r--r-- | Makefile | 9 | ||||
| -rwxr-xr-x | makecolumns.sh | 44 | ||||
| -rw-r--r-- | src/export/map_branches_v3.c | 79 | ||||
| -rw-r--r-- | src/export/map_branches_v4.c (renamed from src/export/map_branches.c) | 0 |
4 files changed, 115 insertions, 17 deletions
| @@ -1,4 +1,4 @@ | |||
| 1 | BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches bin/convert_coords bin/join | 1 | BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches_v3 bin/map_branches_v4 bin/convert_coords bin/join |
| 2 | CFLAGS += -W -Wall -Wextra -O3 # -Weverything -Wno-cast-align -Wno-padded | 2 | CFLAGS += -W -Wall -Wextra -O3 # -Weverything -Wno-cast-align -Wno-padded |
| 3 | 3 | ||
| 4 | all: $(BINARIES) | 4 | all: $(BINARIES) |
| @@ -24,8 +24,11 @@ bin/split_version_2: src/export/split_version_2.c src/export/mystdlib.c | |||
| 24 | bin/map_coords: src/export/map_coords.c src/export/mystdlib.c | 24 | bin/map_coords: src/export/map_coords.c src/export/mystdlib.c |
| 25 | $(CC) $(CFLAGS) -o $@ src/export/map_coords.c src/export/mystdlib.c | 25 | $(CC) $(CFLAGS) -o $@ src/export/map_coords.c src/export/mystdlib.c |
| 26 | 26 | ||
| 27 | bin/map_branches: src/export/map_branches.c | 27 | bin/map_branches_v4: src/export/map_branches_v4.c |
| 28 | $(CC) $(CFLAGS) -o $@ src/export/map_branches.c | 28 | $(CC) $(CFLAGS) -o $@ src/export/map_branches_v4.c |
| 29 | |||
| 30 | bin/map_branches_v3: src/export/map_branches_v3.c | ||
| 31 | $(CC) $(CFLAGS) -o $@ src/export/map_branches_v3.c | ||
| 29 | 32 | ||
| 30 | bin/convert_coords: src/export/convert_coords.c | 33 | bin/convert_coords: src/export/convert_coords.c |
| 31 | $(CC) $(CFLAGS) -o $@ src/export/convert_coords.c -lm | 34 | $(CC) $(CFLAGS) -o $@ src/export/convert_coords.c -lm |
diff --git a/makecolumns.sh b/makecolumns.sh index edd965c..4f4bebc 100755 --- a/makecolumns.sh +++ b/makecolumns.sh | |||
| @@ -171,9 +171,21 @@ handle_format_version_2() { | |||
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | handle_format_version_3() { | 173 | handle_format_version_3() { |
| 174 | echo "Working on $1. Detected pre-2004 Telefonbuch version." | 174 | # glob |
| 175 | teiln=`printf "%s" "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt]` | ||
| 176 | braid=`printf "%s" "$1"/[Dd][Aa][Tt]/[Bb][Rr][Aa][Ii][Dd].[Dd][Aa][Tt]` | ||
| 177 | streets=`printf "%s" "$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]` | ||
| 178 | karto=`printf "%s" "$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]` | ||
| 179 | |||
| 180 | if [ -f "${braid}" ]; then | ||
| 181 | echo "Working on $1. Detected pre-2004 Yellow Pages version." | ||
| 182 | is_yp=true | ||
| 183 | else | ||
| 184 | echo "Working on $1. Detected pre-2004 Telefonbuch version." | ||
| 185 | unset is_yp | ||
| 186 | fi | ||
| 175 | # Extract teiln.dat | 187 | # Extract teiln.dat |
| 176 | do_decompress_version_3 "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] "teiln.dat" | 188 | do_decompress_version_3 "${teiln}" "teiln.dat" |
| 177 | 189 | ||
| 178 | # See how long each filename is | 190 | # See how long each filename is |
| 179 | export filename_len=$(( `ls | head -n 1 | wc -c` - 1 )) | 191 | export filename_len=$(( `ls | head -n 1 | wc -c` - 1 )) |
| @@ -205,8 +217,14 @@ handle_format_version_3() { | |||
| 205 | # set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}` | 217 | # set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}` |
| 206 | # tail -c +$(( $2 + 1 )) ${file} | 218 | # tail -c +$(( $2 + 1 )) ${file} |
| 207 | # done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname | 219 | # done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname |
| 208 | cut -c 1 < 01_unknown > 01_Flags | 220 | if [ "${is_yp}" ]; then |
| 209 | cut -c 2- < 01_unknown > 02_Nachname | 221 | cut -c 1 < 01_unknown > 01_Flags |
| 222 | cut -c 2-7 < 01_unknown > 09_Branchenindex | ||
| 223 | cut -c 8- < 01_unknown > 02_Nachname | ||
| 224 | else | ||
| 225 | cut -c 1 < 01_unknown > 01_Flags | ||
| 226 | cut -c 2- < 01_unknown > 02_Nachname | ||
| 227 | fi | ||
| 210 | rm 01_unknown | 228 | rm 01_unknown |
| 211 | printf "done.\n" | 229 | printf "done.\n" |
| 212 | 230 | ||
| @@ -259,7 +277,6 @@ handle_format_version_3() { | |||
| 259 | 277 | ||
| 260 | # If street names come in an extra file, extract | 278 | # If street names come in an extra file, extract |
| 261 | # street names first | 279 | # street names first |
| 262 | streets="$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt] | ||
| 263 | [ -f "${streets}" ] && do_processfile_version_3 "${streets}" "street name" 99_Strassenname convert_zeros | 280 | [ -f "${streets}" ] && do_processfile_version_3 "${streets}" "street name" 99_Strassenname convert_zeros |
| 264 | 281 | ||
| 265 | # extract street names if 07_unknown contains street indexes | 282 | # extract street names if 07_unknown contains street indexes |
| @@ -280,7 +297,6 @@ handle_format_version_3() { | |||
| 280 | tidy_streetnames 07_Strasse | 297 | tidy_streetnames 07_Strasse |
| 281 | fi | 298 | fi |
| 282 | 299 | ||
| 283 | karto="$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt] | ||
| 284 | if [ -f "${karto}" ]; then | 300 | if [ -f "${karto}" ]; then |
| 285 | do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw | 301 | do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw |
| 286 | 302 | ||
| @@ -290,6 +306,14 @@ handle_format_version_3() { | |||
| 290 | paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten | 306 | paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten |
| 291 | printf "done.\n" | 307 | printf "done.\n" |
| 292 | fi | 308 | fi |
| 309 | |||
| 310 | if [ -f "${braid}" ]; then | ||
| 311 | do_processfile_version_3 "${braid}" "branchen name index" 97_Branchenname convert_zeros | ||
| 312 | |||
| 313 | printf "Looking up branch names from codes ... " | ||
| 314 | map_branches_v3 97_Branchenname < 09_Branchenindex > 09_Branchen | ||
| 315 | printf "done.\n" | ||
| 316 | fi | ||
| 293 | } | 317 | } |
| 294 | 318 | ||
| 295 | handle_format_version_4() { | 319 | handle_format_version_4() { |
| @@ -387,17 +411,9 @@ handle_format_version_4() { | |||
| 387 | rm file_* | 411 | rm file_* |
| 388 | printf "done.\n" | 412 | printf "done.\n" |
| 389 | 413 | ||
| 390 | printf "Generating branch name index ... " | ||
| 391 | mkdir branchcodes/ | ||
| 392 | while read index name; do | ||
| 393 | printf $name > branchcodes/${index} | ||
| 394 | done < 97_Branchenname | ||
| 395 | printf "done.\n" | ||
| 396 | |||
| 397 | printf "Looking up branch names from codes ... " | 414 | printf "Looking up branch names from codes ... " |
| 398 | map_branches 97_Branchenname < 09_Verweise > 09_Branchen | 415 | map_branches 97_Branchenname < 09_Verweise > 09_Branchen |
| 399 | printf "done.\n" | 416 | printf "done.\n" |
| 400 | rm -r branchcodes | ||
| 401 | fi | 417 | fi |
| 402 | } | 418 | } |
| 403 | 419 | ||
diff --git a/src/export/map_branches_v3.c b/src/export/map_branches_v3.c new file mode 100644 index 0000000..22d0036 --- /dev/null +++ b/src/export/map_branches_v3.c | |||
| @@ -0,0 +1,79 @@ | |||
| 1 | #define _WITH_GETLINE | ||
| 2 | #define _GNU_SOURCE | ||
| 3 | #include <stdlib.h> | ||
| 4 | #include <stdint.h> | ||
| 5 | #include <stdio.h> | ||
| 6 | #include <string.h> | ||
| 7 | #include <ctype.h> | ||
| 8 | |||
/* One entry of the branch-code map: a numeric code and its display name. */
typedef struct {
  long code;
  char *name;   /* heap-allocated by main() via asprintf() */
} branchen_code;

/* Fixed-capacity lookup table; main() fills it and sorts it by code. */
enum { MAX_CODES = 128 * 1024 };
branchen_code g_codes[MAX_CODES];
long g_code_count;   /* number of valid entries in g_codes */

/*
 * bsearch() comparator.
 *
 * key is NOT a pointer to an object: the caller packs the numeric code
 * itself into the pointer value ((void *)(uintptr_t)code), so it is
 * converted back to an integer here instead of being dereferenced.
 * bc points at the table entry being probed.
 *
 * Returns <0, 0 or >0.  A three-way comparison is used instead of a
 * subtraction because the difference of two longs does not fit into an
 * int in general; narrowing it could report equality (or the wrong sign)
 * for codes that differ by a multiple of 2^32.
 */
static int find_code( const void *key, const void *bc)
{
  long wanted = (long)(intptr_t)key;
  long have   = ((const branchen_code*)bc)->code;

  return (wanted > have) - (wanted < have);
}

/*
 * qsort() comparator ordering entries by ascending code.
 * Uses the same overflow-free three-way comparison as find_code() so that
 * sorting and searching agree on the ordering.
 */
static int qsort_cmp( const void *a, const void *b )
{
  long lhs = ((const branchen_code*)a)->code;
  long rhs = ((const branchen_code*)b)->code;

  return (lhs > rhs) - (lhs < rhs);
}
| 27 | |||
| 28 | int main( int argc, char ** args ) | ||
| 29 | { | ||
| 30 | FILE * map_file; | ||
| 31 | char *end_p, *input = malloc(1024); | ||
| 32 | size_t input_length = 1024; | ||
| 33 | ssize_t ll; | ||
| 34 | |||
| 35 | if( argc != 2 ) { fprintf( stderr, "Syntax: %s <branchcodes> < <branches_files>\n", args[0] ); exit(111); } | ||
| 36 | |||
| 37 | map_file = fopen( args[1], "r" ); | ||
| 38 | if (!map_file || !input) { fprintf( stderr, "Error allocating resources\n" ); exit( 111 ); } | ||
| 39 | |||
| 40 | /* Fill array with maps */ | ||
| 41 | while ( (ll = getline( &input, &input_length, map_file ) ) >= 0 ) { | ||
| 42 | char * r = strchr(input, 10); | ||
| 43 | if (r) *r = 0; | ||
| 44 | g_codes[g_code_count].code = strtoul(input, &end_p, 10); | ||
| 45 | |||
| 46 | if (input == end_p) break; | ||
| 47 | if (*end_p != ';') { fprintf( stderr, "Input error, line: %s\n", input); exit(1); } | ||
| 48 | |||
| 49 | r = strchr(end_p + 1, ';'); | ||
| 50 | if (!r) { fprintf( stderr, "Input error, line: %s\n", input); exit(1); } | ||
| 51 | *r = 0; | ||
| 52 | |||
| 53 | asprintf(&g_codes[g_code_count].name, "%s", end_p + 1) ; | ||
| 54 | // printf( "%ld: %s\n", g_codes[g_code_count].code, g_codes[g_code_count].name); | ||
| 55 | g_code_count++; | ||
| 56 | } | ||
| 57 | |||
| 58 | qsort(g_codes, g_code_count, sizeof(branchen_code), qsort_cmp ); | ||
| 59 | |||
| 60 | /* Now scan lines from 09_Verweise for semicolon separated branchen codes */ | ||
| 61 | while ( (ll = getline( &input, &input_length, stdin ) ) >= 0 ) { | ||
| 62 | char *codes = input; | ||
| 63 | branchen_code *bc; | ||
| 64 | int multiple; | ||
| 65 | for (multiple = 0;; ++multiple) { | ||
| 66 | long code = strtoul(codes, &end_p, 10); | ||
| 67 | if (codes == end_p) break; | ||
| 68 | bc = (branchen_code*)bsearch((void *)(uintptr_t)code, g_codes, g_code_count, sizeof(branchen_code), find_code); | ||
| 69 | if (bc) { | ||
| 70 | if (multiple) putchar(';'); | ||
| 71 | printf("%s", bc->name); | ||
| 72 | } | ||
| 73 | if (*end_p != ';') break; | ||
| 74 | codes = end_p + 1; | ||
| 75 | } | ||
| 76 | putchar(10); | ||
| 77 | } | ||
| 78 | return 0; | ||
| 79 | } | ||
diff --git a/src/export/map_branches.c b/src/export/map_branches_v4.c index 160945d..160945d 100644 --- a/src/export/map_branches.c +++ b/src/export/map_branches_v4.c | |||
