summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDirk Engling <erdgeist@erdgeist.org>2019-03-04 02:07:53 +0100
committerDirk Engling <erdgeist@erdgeist.org>2019-03-04 02:07:53 +0100
commit2eff180815619302f101a4ff32db6cf293b529c8 (patch)
tree77606dbd045a4c5236fe899927e10f7cf0e8e9e7
parent5ac7e2e4229dab3124ad9d1556ea714a3149fff0 (diff)
unify column cleansing
-rwxr-xr-xmakecolumns.sh169
-rw-r--r--src/export/convert_coords.c6
2 files changed, 135 insertions, 40 deletions
diff --git a/makecolumns.sh b/makecolumns.sh
index 4381f13..3f05a61 100755
--- a/makecolumns.sh
+++ b/makecolumns.sh
@@ -104,14 +104,13 @@ handle_format_version_1() {
104 104
105 # rename our extracted columns 105 # rename our extracted columns
106 mv 01_unknown 01_Flags 106 mv 01_unknown 01_Flags
107
107 mv 02_unknown 02_Nachname 108 mv 02_unknown 02_Nachname
108 mv 03_unknown 03_Vorname 109 mv 03_unknown 03_Vorname
109 mv 04_unknown 05_Adresszusatz
110 mv 05_unknown 06_Ortszusatz 110 mv 05_unknown 06_Ortszusatz
111 mv 06_unknown 10_Zustellamt_PLZOst 111 mv 06_unknown 10_Zustellamt_PLZOst
112 mv 07_unknown 07_Strasse 112 mv 07_unknown 07_Strasse
113 mv 08_unknown 08_Hausnummer 113 mv 08_unknown 08_Hausnummer
114 mv 09_unknown 04_Namenszusatz
115 mv 10_unknown 09_Verweise 114 mv 10_unknown 09_Verweise
116 mv 11_unknown 12_Vorwahl 115 mv 11_unknown 12_Vorwahl
117 mv 12_unknown 13_Rufnummer 116 mv 12_unknown 13_Rufnummer
@@ -119,10 +118,19 @@ handle_format_version_1() {
119 mv 14_unknown 10_Postleitzahl 118 mv 14_unknown 10_Postleitzahl
120 119
121 printf "Normalizing zusaetze ... " 120 printf "Normalizing zusaetze ... "
122 paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze 121 sed -E -e 's:^, +:u. :' 09_unknown > 04_Namenszusatz
122 sed -E -e 's:^, +:u. :' 04_unknown > 05_Adresszusatz
123 paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze
123 printf "done.\n" 124 printf "done.\n"
124 125
125 tidy_streetnames 07_Strasse 126 # For consistency, create files with empty lines
127 tr -dC '\n' < 01_Flags > 14_Webadresse
128 cp 14_Webadresse 15_Email
129 sed $'s:.*:\t:' 01_Flags > 16_Koordinaten
130
131 tidy_columns
132
133 rm ??_unknown
126} 134}
127 135
128handle_format_version_2() { 136handle_format_version_2() {
@@ -150,8 +158,6 @@ handle_format_version_2() {
150 mv 01_unknown 01_Flags 158 mv 01_unknown 01_Flags
151 mv 16_unknown 02_Nachname 159 mv 16_unknown 02_Nachname
152 mv 07_unknown 03_Vorname 160 mv 07_unknown 03_Vorname
153 mv 14_unknown 04_Namenszusatz
154 mv 11_unknown 05_Adresszusatz
155 mv 12_unknown 06_Ortszusatz 161 mv 12_unknown 06_Ortszusatz
156 mv 08_unknown 07_Strasse 162 mv 08_unknown 07_Strasse
157 mv 10_unknown 08_Hausnummer 163 mv 10_unknown 08_Hausnummer
@@ -162,14 +168,20 @@ handle_format_version_2() {
162 mv 05_unknown 12_Vorwahl 168 mv 05_unknown 12_Vorwahl
163 mv 06_unknown 13_Rufnummer 169 mv 06_unknown 13_Rufnummer
164 170
165 # remove entries that are for searching only
166 rm 03_unknown 04_unknown
167
168 printf "Normalizing zusaetze ... " 171 printf "Normalizing zusaetze ... "
169 paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze 172 sed -E -e 's:^, +:u. :' 14_unknown > 04_Namenszusatz
173 sed -E -e 's:^, +:u. :' 11_unknown > 05_Adresszusatz
174 paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze
170 printf "done.\n" 175 printf "done.\n"
171 176
172 tidy_streetnames 07_Strasse 177 # For consistency, create files with empty lines
178 tr -dC '\n' < 01_Flags > 14_Webadresse
179 cp 14_Webadresse 15_Email
180 sed $'s:.*:\t:' 01_Flags > 16_Koordinaten
181
182 tidy_columns
183
184 rm ??_unknown
173} 185}
174 186
175handle_format_version_3() { 187handle_format_version_3() {
@@ -227,11 +239,10 @@ handle_format_version_3() {
227 cut -c 1 < 01_unknown > 01_Flags 239 cut -c 1 < 01_unknown > 01_Flags
228 cut -c 2- < 01_unknown > 02_Nachname 240 cut -c 2- < 01_unknown > 02_Nachname
229 fi 241 fi
230 rm 01_unknown
231 printf "done.\n" 242 printf "done.\n"
232 243
233 printf "Splitting decompress vname chunks into their columns ... " 244 printf "Splitting decompress vname chunks into their columns ... "
234 JOT "%0${filename_len}d" ${vname_file} $(( number_of_files - 1 )) 3 | xargs -n 128 cat | tr '\n\0' '\t\n' | tr -d '\377' > 03_Vorname 245 JOT "%0${filename_len}d" ${vname_file} $(( number_of_files - 1 )) 3 | xargs -n 128 cat | tr '\n\0' '\t\n' | tr -d '\377' | awk '{$1=$1};1' > 03_Vorname
235 printf "done.\n" 246 printf "done.\n"
236 247
237 printf "Splitting decompress table file chunks into their columns ... " 248 printf "Splitting decompress table file chunks into their columns ... "
@@ -256,20 +267,21 @@ handle_format_version_3() {
256 printf "done.\n" 267 printf "done.\n"
257 268
258 # rename our columns extracted from the table file 269 # rename our columns extracted from the table file
259 mv 04_unknown 04_Namenszusatz
260 mv 05_unknown 05_Adresszusatz
261 mv 06_unknown 06_Ortszusatz 270 mv 06_unknown 06_Ortszusatz
262 mv 08_unknown 08_Hausnummer 271 mv 08_unknown 08_Hausnummer
263 mv 09_unknown 09_Verweise
264 mv 10_unknown 10_Postleitzahl 272 mv 10_unknown 10_Postleitzahl
265 mv 11_unknown 11_Ort 273 mv 11_unknown 11_Ort
266 mv 12_unknown 12_Vorwahl 274 mv 12_unknown 12_Vorwahl
267 mv 13_unknown 13_Rufnummer 275 mv 13_unknown 13_Rufnummer
268 if [ -f 14_unknown -a -f 15_unknown ]; then 276 if [ -f 14_unknown ]; then
269 paste 15_unknown 14_unknown | sed 's/[[:space:]]$//g' > 14_15_Email_Webadresse 277 tr '\\' '/' < 14_unknown | iconv -f iso-8859-15 -t utf-8 > 15_Email
270 rm 14_unknown 15_unknown
271 else 278 else
272 tr -dC '\n' < 01_Flags > 14_15_Email_Webadresse 279 tr -dC '\n' < 01_Flags > 15_Email
280 fi
281 if [ -f 15_unknown ]; then
282 tr '\\' '/' < 15_unknown | iconv -f iso-8859-15 -t utf-8 > 14_Webadresse
283 else
284 tr -dC '\n' < 01_Flags > 14_Webadresse
273 fi 285 fi
274 286
275 printf "Normalizing flags ... " 287 printf "Normalizing flags ... "
@@ -278,9 +290,34 @@ handle_format_version_3() {
278 printf "done.\n" 290 printf "done.\n"
279 291
280 printf "Normalizing zusaetze ... " 292 printf "Normalizing zusaetze ... "
281 paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E -e 's/ +/ /g' -e 's/^ +//g' -e 's/ +$//g' > 04_Zusaetze 293 sed -E -e 's:^, +:u. :' 04_unknown > 04_Namenszusatz
294 sed -E -e 's:^, +:u. :' 05_unknown > 05_Adresszusatz
295 paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze
296 printf "done.\n"
297
298 printf "Normalizing verweise ... "
299 sed -E -e 's:^\|::g;s:\|$::g;s:\|:, :g' 09_unknown | awk '{$1=$1};1' > 09_Verweise
282 printf "done.\n" 300 printf "done.\n"
283 301
302 # At least 2002_Q3 and 2003_Q1 are known to sport | -separated vname and nname fields
303 # those fields are redundant, as they are being made explicit in 09_-column for these entries
304 if grep -q '|' 03_Vorname; then
305
306 printf "Treating vname fields with pipe separator ... (adds 3 minutes) ... "
307 # Identify entries with | in nname and move content of 09_Verweise to 04_Zusaetze
308 paste 03_Vorname 04_Zusaetze 09_Verweise | sed -E $'s:^([^|]*)\|.*\t(.*)\t(.*):\\1\t\\2 \\3\t:;' > 03_04_09_Temp
309 cut -f 1 03_04_09_Temp | awk '{$1=$1};1' > 03_Vorname
310 cut -f 2 03_04_09_Temp | awk '{$1=$1};1' > 04_Zusaetze
311 cut -f 3 03_04_09_Temp | awk '{$1=$1};1' > 09_Verweise
312 rm 03_04_09_Temp
313
314 # Delete redundant nachnamen values
315 cut -d '|' -f 1 02_Nachname > 02_Nachname.new
316 mv 02_Nachname.new 02_Nachname
317
318 printf "done.\n"
319 fi
320
284 # If street names come in an extra file, extract 321 # If street names come in an extra file, extract
285 # street names first 322 # street names first
286 if [ -f "${streets}" ]; then 323 if [ -f "${streets}" ]; then
@@ -293,30 +330,27 @@ handle_format_version_3() {
293 # instead of street names 330 # instead of street names
294 if [ -f 99_Strassenname ]; then 331 if [ -f 99_Strassenname ]; then
295 mv 07_unknown 07_Strassenindex 332 mv 07_unknown 07_Strassenindex
296 printf "Looking up street names from indexes ... "
297 333
298 # fix up known broken Strassennamen file 334 # fix up known broken Strassennamen file
299 [ `stat -f %z ${streets}` -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname 335 [ `stat -f %z ${streets}` -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname
300 336
301 tidy_streetnames 99_Strassenname 337 printf "Looking up street names from indexes ... "
302
303 cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse 338 cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse
304 printf "done.\n" 339 printf "done.\n"
305 else 340 else
306 mv 07_unknown 07_Strasse 341 mv 07_unknown 07_Strasse
307 tidy_streetnames 07_Strasse
308 fi 342 fi
309 343
310 if [ -f "${karto}" ]; then 344 if [ -f "${karto}" ]; then
311 do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw 345 do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw
312 346
313 printf "Looking up geo coordinates for each phonebook entry ... " 347 printf "Looking up geo coordinates for each phonebook entry ... "
314 tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr 348 tr '\0;' '\n\t' < 90_Geokoordinaten_hnr_raw | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr
315 rm 90_Geokoordinaten_hnr_raw 349 rm 90_Geokoordinaten_hnr_raw
316 paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten 350 paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
317 printf "done.\n" 351 printf "done.\n"
318 else 352 else
319 sed $'s:.*:\t' 01_Flags > 16_Koordinaten 353 sed $'s:.*:\t:' 01_Flags > 16_Koordinaten
320 fi 354 fi
321 355
322 if [ -f "${braid}" ]; then 356 if [ -f "${braid}" ]; then
@@ -326,6 +360,10 @@ handle_format_version_3() {
326 map_branches_v3 97_Branchenname < 09_Branchenindex > 09_Branchen 360 map_branches_v3 97_Branchenname < 09_Branchenindex > 09_Branchen
327 printf "done.\n" 361 printf "done.\n"
328 fi 362 fi
363
364 tidy_columns
365
366 rm ??_unknown
329} 367}
330 368
331handle_format_version_4() { 369handle_format_version_4() {
@@ -367,15 +405,14 @@ handle_format_version_4() {
367 if grep -q ^40 column_0; then 405 if grep -q ^40 column_0; then
368 printf "Cleanung up inverted reverse search flags ... " 406 printf "Cleanung up inverted reverse search flags ... "
369 awk '{ a=substr($0,1,1); printf "%x%x\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags 407 awk '{ a=substr($0,1,1); printf "%x%x\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags
370 rm column_0
371 printf "done\n" 408 printf "done\n"
372 else 409 else
373 mv column_0 01_Flags 410 mv column_0 01_Flags
374 fi 411 fi
412
375 mv column_1 02_Nachname 413 mv column_1 02_Nachname
376 mv column_2 03_Vorname 414 mv column_2 03_Vorname
377 mv column_3 04_05_Namenszusatz_Addresszusatz 415 mv column_3 04_05_Namenszusatz_Addresszusatz
378 mv column_4 09_Verweise
379 mv column_5 07_08_Strassenindex_Hausnummer 416 mv column_5 07_08_Strassenindex_Hausnummer
380 mv column_6 12_Vorwahl 417 mv column_6 12_Vorwahl
381 mv column_7 10_Postleitzahl 418 mv column_7 10_Postleitzahl
@@ -383,8 +420,6 @@ handle_format_version_4() {
383 mv column_9 13_Rufnummer 420 mv column_9 13_Rufnummer
384 mv column_10 14_15_Email_Webadresse 421 mv column_10 14_15_Email_Webadresse
385 422
386 tidy_streetnames 99_Strassenname
387
388 printf "Looking up street names from indexes ... " 423 printf "Looking up street names from indexes ... "
389 cut -f 1 07_08_Strassenindex_Hausnummer | ${EL} -0 99_Strassenname > 07_Strasse 424 cut -f 1 07_08_Strassenindex_Hausnummer | ${EL} -0 99_Strassenname > 07_Strasse
390 printf "done.\n" 425 printf "done.\n"
@@ -394,7 +429,19 @@ handle_format_version_4() {
394 printf "done.\n" 429 printf "done.\n"
395 430
396 printf "Normalizing zusaetze ... " 431 printf "Normalizing zusaetze ... "
397 tr '\t' ' ' < 04_05_Namenszusatz_Addresszusatz | sed -E s/' +'/' '/g > 04_Zusaetze 432 sed -E -e $'s:(^|\t),: u. :g' 04_05_Namenszusatz_Addresszusatz | awk '{$1=$1};1' > 04_Zusaetze
433 printf "done.\n"
434
435 printf "Normalizing verweise ... "
436 sed -E -e $'s:^\|+::g;s:\|+$::g;s:\|:, :g' column_4 | awk '{$1=$1};1' > 09_Verweise
437 printf "done.\n"
438
439 printf "Splitting webaddress ... "
440 cut -d $'\t' -f 1 14_15_Email_Webadresse | tr '\\' '/' | iconv -f iso-8859-15 -t utf-8 > 14_Webadresse
441 printf "done.\n"
442
443 printf "Splitting email ... "
444 sed $'s:$:\t:' < 14_15_Email_Webadresse | cut -sd $'\t' -f 2 | tr '\\' '/' | iconv -f iso-8859-15 -t utf-8 > 15_Email
398 printf "done.\n" 445 printf "done.\n"
399 446
400 if [ -f "$1/zip-streets-hn-geo.tl" ]; then 447 if [ -f "$1/zip-streets-hn-geo.tl" ]; then
@@ -414,7 +461,7 @@ handle_format_version_4() {
414 paste 10_Postleitzahl 07_Strasse | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten 461 paste 10_Postleitzahl 07_Strasse | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten
415 printf "done.\n" 462 printf "done.\n"
416 else 463 else
417 sed $'s:.*:\t' 01_Flags > 16_Koordinaten 464 sed $'s:.*:\t:' 01_Flags > 16_Koordinaten
418 fi 465 fi
419 rm file_* 466 rm file_*
420 467
@@ -429,19 +476,65 @@ handle_format_version_4() {
429 map_branches_v4 97_Branchenname < 09_Verweise > 09_Branchen 476 map_branches_v4 97_Branchenname < 09_Verweise > 09_Branchen
430 printf "done.\n" 477 printf "done.\n"
431 fi 478 fi
479
480 tidy_columns
481 rm column_*
432} 482}
433 483
434tidy_streetnames () { 484tidy_columns () {
435 streets="$1"
436 485
437 printf "Tyding up streetnames ... " 486 printf "Removing backslashes from Nachnamen ... "
487 sed -E -e 's:\\::g' 02_Nachname | awk '{$1=$1};1' | iconv -f iso-8859-15 -t utf-8 > 02_Nachname.new
488 mv 02_Nachname.new 02_Nachname
489 printf "done.\n"
490
491 printf "Unicoding Vornamen ... "
492 iconv -f iso-8859-15 -t utf-8 03_Vorname > 03_Vorname.new
493 mv 03_Vorname.new 03_Vorname
494 printf "done.\n"
438 495
496 printf "Unicoding Zusaetze ... "
497 iconv -f iso-8859-15 -t utf-8 04_Zusaetze > 04_Zusaetze.new
498 mv 04_Zusaetze.new 04_Zusaetze
499 printf "done.\n"
500
501 printf "Tidying up streetnames ... "
439 # Replace any dots at end of line by a single one 502 # Replace any dots at end of line by a single one
440 # finish any str abbreviation without a period with a period 503 # finish any str abbreviation without a period with a period
441 sed -E -i.bak 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/ -$/ Str./;s/-$/str./' ${streets} 504 sed -E 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/(.*)-(.*) -/\1-\2-Str./;s/ -$/ Str./;s/-$/str./' 07_Strasse | iconv -f iso-8859-15 -t utf-8 > 07_Strasse.new
442 rm "${streets}".bak 505 mv 07_Strasse.new 07_Strasse
506 printf "done.\n"
443 507
508 printf "Normalizing house numbers ... "
509 sed -E -e 's:^([[:digit:]]+) *([A-Za-z])$:\1 \2:' -e 's: a$: A:;s: b$: B:;s: c$: C:;s: d$: D:;s: e$: E:;s: f$: F:;s: g$: G:;s: h$: H:;s: i$: I:;s: j$: J:;s: k$: K:;s: l$: L:;s: m$: M:;s: n$: N:;s: o$: O:;' 08_Hausnummer | iconv -f iso-8859-15 -t utf-8 > 08_Hausnummer.new
510 mv 08_Hausnummer.new 08_Hausnummer
444 printf "done.\n" 511 printf "done.\n"
512
513 printf "Unicoding Verweise ... "
514 iconv -f iso-8859-15 -t utf-8 09_Verweise > 09_Verweise.new
515 mv 09_Verweise.new 09_Verweise
516 printf "done.\n"
517
518 printf "Unicoding Postleitzahl ... "
519 iconv -f iso-8859-15 -t utf-8 10_Postleitzahl > 10_Postleitzahl.new
520 mv 10_Postleitzahl.new 10_Postleitzahl
521 printf "done.\n"
522
523 printf "Removing trailing * from Ort ... "
524 sed -E -e 's:\*$::' 11_Ort | iconv -f iso-8859-15 -t utf-8 > 11_Ort.new
525 mv 11_Ort.new 11_Ort
526 printf "done.\n"
527
528 printf "Unicoding Vorwahl ... "
529 iconv -f iso-8859-15 -t utf-8 12_Vorwahl > 12_Vorwahl.new
530 mv 12_Vorwahl.new 12_Vorwahl
531 printf "done.\n"
532
533 printf "Unicoding Rufnummer ... "
534 iconv -f iso-8859-15 -t utf-8 13_Rufnummer > 13_Rufnummer.new
535 mv 13_Rufnummer.new 13_Rufnummer
536 printf "done.\n"
537
445} 538}
446 539
447# JOT <format> <begin> <end> <step> 540# JOT <format> <begin> <end> <step>
diff --git a/src/export/convert_coords.c b/src/export/convert_coords.c
index abbbc22..64d7cbe 100644
--- a/src/export/convert_coords.c
+++ b/src/export/convert_coords.c
@@ -31,8 +31,10 @@ int main( )
31 31
32 double lon = l0+th/n; 32 double lon = l0+th/n;
33 double lat = 2.0*atan(pow(F/r,1.0/n))-0.5*M_PI; 33 double lat = 2.0*atan(pow(F/r,1.0/n))-0.5*M_PI;
34 34 if (x > 0 && y > 0)
35 printf("%lf\t%lf\n", lat*180.0/M_PI, lon*180.0/M_PI); 35 printf("%lf\t%lf\n", lat*180.0/M_PI, lon*180.0/M_PI);
36 else
37 printf("\t\n");
36 } else 38 } else
37 printf("\t\n"); 39 printf("\t\n");
38 } 40 }