diff options
-rw-r--r-- | Makefile | 5 | ||||
-rwxr-xr-x | makecolumns.sh | 26 | ||||
-rw-r--r-- | src/postprocess/join.c | 81 |
3 files changed, 109 insertions, 3 deletions
@@ -1,4 +1,4 @@ | |||
1 | BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/convert_coords | 1 | BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/convert_coords bin/join |
2 | CFLAGS += -W -Wall -Wextra -O3 # -Weverything -Wno-cast-align -Wno-padded | 2 | CFLAGS += -W -Wall -Wextra -O3 # -Weverything -Wno-cast-align -Wno-padded |
3 | 3 | ||
4 | all: $(BINARIES) | 4 | all: $(BINARIES) |
@@ -27,6 +27,9 @@ bin/map_coords: src/export/map_coords.c src/export/mystdlib.c | |||
27 | bin/convert_coords: src/export/convert_coords.c | 27 | bin/convert_coords: src/export/convert_coords.c |
28 | $(CC) $(CFLAGS) -o $@ src/export/convert_coords.c -lm | 28 | $(CC) $(CFLAGS) -o $@ src/export/convert_coords.c -lm |
29 | 29 | ||
30 | bin/join: src/postprocess/join.c src/export/mystdlib.c | ||
31 | $(CC) $(CFLAGS) -o $@ src/postprocess/join.c src/export/mystdlib.c -Isrc/export | ||
32 | |||
30 | .PHONY: clean | 33 | .PHONY: clean |
31 | clean: | 34 | clean: |
32 | @rm -f $(BINARIES) | 35 | @rm -f $(BINARIES) |
diff --git a/makecolumns.sh b/makecolumns.sh index 8131379..0854b32 100755 --- a/makecolumns.sh +++ b/makecolumns.sh | |||
@@ -112,6 +112,7 @@ handle_format_version_1() { | |||
112 | lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze | 112 | lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze |
113 | printf "done.\n" | 113 | printf "done.\n" |
114 | 114 | ||
115 | tidy_streetnames 07_Strasse | ||
115 | } | 116 | } |
116 | 117 | ||
117 | handle_format_version_2() { | 118 | handle_format_version_2() { |
@@ -158,6 +159,7 @@ handle_format_version_2() { | |||
158 | lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze | 159 | lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze |
159 | printf "done.\n" | 160 | printf "done.\n" |
160 | 161 | ||
162 | tidy_streetnames 07_Strasse | ||
161 | } | 163 | } |
162 | 164 | ||
163 | handle_format_version_3() { | 165 | handle_format_version_3() { |
@@ -243,7 +245,7 @@ handle_format_version_3() { | |||
243 | printf "done.\n" | 245 | printf "done.\n" |
244 | 246 | ||
245 | printf "Normalizing zusaetze ... " | 247 | printf "Normalizing zusaetze ... " |
246 | lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze | 248 | lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E -e 's/ +/ /g' -e 's/^ +//g' -e 's/ +$//g' > 04_Zusaetze |
247 | printf "done.\n" | 249 | printf "done.\n" |
248 | 250 | ||
249 | # If street names come in an extra file, extract | 251 | # If street names come in an extra file, extract |
@@ -260,10 +262,13 @@ handle_format_version_3() { | |||
260 | # fix up known broken Strassennamen file | 262 | # fix up known broken Strassennamen file |
261 | [ `stat -f %z ${streets}` -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname | 263 | [ `stat -f %z ${streets}` -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname |
262 | 264 | ||
265 | tidy_streetnames 99_Strassenname | ||
266 | |||
263 | cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse | 267 | cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse |
264 | printf "done.\n" | 268 | printf "done.\n" |
265 | else | 269 | else |
266 | mv 07_unknown 07_Strasse | 270 | mv 07_unknown 07_Strasse |
271 | tidy_streetnames 07_Strasse | ||
267 | fi | 272 | fi |
268 | 273 | ||
269 | karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt] | 274 | karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt] |
@@ -306,7 +311,13 @@ handle_format_version_4() { | |||
306 | find . -name file_\* -delete | 311 | find . -name file_\* -delete |
307 | printf "done.\n" | 312 | printf "done.\n" |
308 | 313 | ||
309 | mv column_0 01_Flags | 314 | # the 'did not object to inverse search' flag is insane and needs to be reversed |
315 | if grep -q ^40 column_0; then | ||
316 | awk '{ a=substr($0,1,1); printf "%x%x\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags | ||
317 | rm column_0 | ||
318 | else | ||
319 | mv column_0 01_Flags | ||
320 | fi | ||
310 | mv column_1 02_Nachname | 321 | mv column_1 02_Nachname |
311 | mv column_2 03_Vorname | 322 | mv column_2 03_Vorname |
312 | mv column_3 04_05_Namenszusatz_Addresszusatz | 323 | mv column_3 04_05_Namenszusatz_Addresszusatz |
@@ -318,6 +329,8 @@ handle_format_version_4() { | |||
318 | mv column_9 13_Rufnummer | 329 | mv column_9 13_Rufnummer |
319 | mv column_10 14_15_Email_Webadresse | 330 | mv column_10 14_15_Email_Webadresse |
320 | 331 | ||
332 | tidy_streetnames 99_Strassenname | ||
333 | |||
321 | printf "Looking up street names from indexes ... " | 334 | printf "Looking up street names from indexes ... " |
322 | cut -f 1 07_08_Strassenindex_Hausnummer | ${EL} -0 99_Strassenname > 07_Strasse | 335 | cut -f 1 07_08_Strassenindex_Hausnummer | ${EL} -0 99_Strassenname > 07_Strasse |
323 | printf "done.\n" | 336 | printf "done.\n" |
@@ -348,6 +361,15 @@ handle_format_version_4() { | |||
348 | printf "done.\n" | 361 | printf "done.\n" |
349 | fi | 362 | fi |
350 | rm file_* | 363 | rm file_* |
364 | |||
365 | } | ||
366 | |||
# tidy_streetnames FILE
#
# Normalize the street names stored in FILE, editing it in place
# (BSD sed syntax: -i takes an explicit, here empty, backup suffix).
#
#   - every run of consecutive dots is collapsed into a single dot
#   - a line ending in "Str"/"str", optionally followed by one space or
#     one colon, gets that ending rewritten to "Str."/"str." so the
#     abbreviation is always finished with a period
#
# The file name is taken straight from "$1": shell functions share the
# caller's variables, and callers such as handle_format_version_3 keep
# their own value in ${streets}, which must not be overwritten here.
tidy_streetnames () {
  sed -Ei '' 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./' "$1"
}
352 | 374 | ||
353 | # After function definitions, main() can use them | 375 | # After function definitions, main() can use them |
diff --git a/src/postprocess/join.c b/src/postprocess/join.c new file mode 100644 index 0000000..9782ec8 --- /dev/null +++ b/src/postprocess/join.c | |||
@@ -0,0 +1,81 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <stdlib.h> | ||
3 | #include "mystdlib.h" | ||
4 | |||
5 | #define HUGEBLOCK (1024*1024*256) | ||
6 | #define ZIP_FIELD 3 | ||
7 | #define STREET_FIELD 5 | ||
8 | |||
/*
 * Compare two '\n'-terminated records for equality.
 *
 * Both a and b must point at byte sequences that contain a '\n'; the
 * newline acts as the terminator and nothing behind it is read.
 *
 * Returns 0 when both records are identical up to and including their
 * newline, and -1 in every other case.  Unlike strcmp() the result
 * carries no ordering information.
 */
int rt_strcmp( const uint8_t *a, const uint8_t *b ) {
  while( *a != '\n' && *b != '\n' && *a == *b ) {
    ++a;
    ++b;
  }

  /* Equal bytes at this point mean both sides stopped on their newline. */
  return ( *a == *b ) ? 0 : -1;
}
14 | |||
/*
 * Copy one '\n'-terminated record from src to dest.
 *
 * The terminating newline is copied as well; no NUL byte is appended.
 * dest must have room for the whole record including the newline, the
 * function itself performs no bounds checking.
 *
 * Returns the number of bytes written, newline included.
 */
size_t rt_strcpy( uint8_t *dest, const uint8_t *src ) {
  uint8_t *d = dest;

  while( *src != '\n' )
    *d++ = *src++;
  *d++ = '\n';

  return (size_t)( d - dest );
}
22 | |||
/*
 * Length of a '\n'-terminated record.
 *
 * str must point at a byte sequence that contains a '\n'.  The returned
 * count includes that newline, so adding it to the record's offset
 * yields the offset of the byte following the record.
 */
size_t rt_strlen( const uint8_t *str ) {
  const uint8_t *s = str;

  while( *s != '\n' )
    ++s;

  /* +1 accounts for the newline itself. */
  return (size_t)( s - str ) + 1;
}
28 | |||
29 | int main( int argc, char **argv ) { | ||
30 | MAP file = map_file( argv[1], 1 ); | ||
31 | uint8_t *out, *in; | ||
32 | size_t last = 3, off = 0, out_off = 0; | ||
33 | int start, end, copy; | ||
34 | |||
35 | (void)argc; | ||
36 | |||
37 | out = malloc( HUGEBLOCK ); | ||
38 | |||
39 | if( !file || !out ) | ||
40 | exit(1); | ||
41 | |||
42 | in = file->addr; | ||
43 | start = 10 * ( in[off] - '0' ) + in[off+1] - '0'; | ||
44 | end = start - 1; | ||
45 | |||
46 | while( off < file->size ) { | ||
47 | int issue = 10 * ( in[off] - '0' ) + in[off+1] - '0'; | ||
48 | off += 3; | ||
49 | copy = 1; | ||
50 | |||
51 | // fprintf( stderr, "issue: %02d start %02d end %02d last %08d off %08d", issue, start, end, last, off ); | ||
52 | switch ( rt_strcmp( in + last, in + off ) ) { | ||
53 | case 1: | ||
54 | last = off; | ||
55 | case 0: | ||
56 | case 2: | ||
57 | if (issue == end + 1 ) copy = 0, end++; | ||
58 | if (issue == end ) copy = 0; | ||
59 | break; | ||
60 | default: | ||
61 | break; | ||
62 | } | ||
63 | // fprintf( stderr, " copy: %d\n", copy ); | ||
64 | |||
65 | if( copy) { | ||
66 | out_off += sprintf( (char*)out + out_off, "%02d%02d\a", start, end ); | ||
67 | out_off += rt_strcpy( out + out_off, in + last ); | ||
68 | start = issue; end = issue; | ||
69 | last = off; | ||
70 | } | ||
71 | |||
72 | off += rt_strlen( in + off ); | ||
73 | |||
74 | if( out_off + 8192 * 2 > HUGEBLOCK ) { | ||
75 | fwrite( out, out_off, 1, stdout ); | ||
76 | out_off = 0; | ||
77 | } | ||
78 | } | ||
79 | |||
80 | return 0; | ||
81 | } | ||