From 2eff180815619302f101a4ff32db6cf293b529c8 Mon Sep 17 00:00:00 2001 From: Dirk Engling Date: Mon, 4 Mar 2019 02:07:53 +0100 Subject: unify column cleansing --- makecolumns.sh | 169 ++++++++++++++++++++++++++++++++++---------- src/export/convert_coords.c | 6 +- 2 files changed, 135 insertions(+), 40 deletions(-) diff --git a/makecolumns.sh b/makecolumns.sh index 4381f13..3f05a61 100755 --- a/makecolumns.sh +++ b/makecolumns.sh @@ -104,14 +104,13 @@ handle_format_version_1() { # rename our extracted columns mv 01_unknown 01_Flags + mv 02_unknown 02_Nachname mv 03_unknown 03_Vorname - mv 04_unknown 05_Adresszusatz mv 05_unknown 06_Ortszusatz mv 06_unknown 10_Zustellamt_PLZOst mv 07_unknown 07_Strasse mv 08_unknown 08_Hausnummer - mv 09_unknown 04_Namenszusatz mv 10_unknown 09_Verweise mv 11_unknown 12_Vorwahl mv 12_unknown 13_Rufnummer @@ -119,10 +118,19 @@ handle_format_version_1() { mv 14_unknown 10_Postleitzahl printf "Normalizing zusaetze ... " - paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze + sed -E -e 's:^, +:u. :' 09_unknown > 04_Namenszusatz + sed -E -e 's:^, +:u. :' 04_unknown > 05_Adresszusatz + paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze printf "done.\n" - tidy_streetnames 07_Strasse + # For consistency, create files with empty lines + tr -dC '\n' < 01_Flags > 14_Webadresse + cp 14_Webadresse 15_Email + sed $'s:.*:\t:' 01_Flags > 16_Koordinaten + + tidy_columns + + rm ??_unknown } handle_format_version_2() { @@ -150,8 +158,6 @@ handle_format_version_2() { mv 01_unknown 01_Flags mv 16_unknown 02_Nachname mv 07_unknown 03_Vorname - mv 14_unknown 04_Namenszusatz - mv 11_unknown 05_Adresszusatz mv 12_unknown 06_Ortszusatz mv 08_unknown 07_Strasse mv 10_unknown 08_Hausnummer @@ -162,14 +168,20 @@ handle_format_version_2() { mv 05_unknown 12_Vorwahl mv 06_unknown 13_Rufnummer - # remove entries that are for searching only - rm 03_unknown 04_unknown - printf "Normalizing zusaetze ... " - paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze + sed -E -e 's:^, +:u. :' 14_unknown > 04_Namenszusatz + sed -E -e 's:^, +:u. :' 11_unknown > 05_Adresszusatz + paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze printf "done.\n" - tidy_streetnames 07_Strasse + # For consistency, create files with empty lines + tr -dC '\n' < 01_Flags > 14_Webadresse + cp 14_Webadresse 15_Email + sed $'s:.*:\t:' 01_Flags > 16_Koordinaten + + tidy_columns + + rm ??_unknown } handle_format_version_3() { @@ -227,11 +239,10 @@ handle_format_version_3() { cut -c 1 < 01_unknown > 01_Flags cut -c 2- < 01_unknown > 02_Nachname fi - rm 01_unknown printf "done.\n" printf "Splitting decompress vname chunks into their columns ... " - JOT "%0${filename_len}d" ${vname_file} $(( number_of_files - 1 )) 3 | xargs -n 128 cat | tr '\n\0' '\t\n' | tr -d '\377' > 03_Vorname + JOT "%0${filename_len}d" ${vname_file} $(( number_of_files - 1 )) 3 | xargs -n 128 cat | tr '\n\0' '\t\n' | tr -d '\377' | awk '{$1=$1};1' > 03_Vorname printf "done.\n" printf "Splitting decompress table file chunks into their columns ... " @@ -256,20 +267,21 @@ handle_format_version_3() { printf "done.\n" # rename our columns extracted from the table file - mv 04_unknown 04_Namenszusatz - mv 05_unknown 05_Adresszusatz mv 06_unknown 06_Ortszusatz mv 08_unknown 08_Hausnummer - mv 09_unknown 09_Verweise mv 10_unknown 10_Postleitzahl mv 11_unknown 11_Ort mv 12_unknown 12_Vorwahl mv 13_unknown 13_Rufnummer - if [ -f 14_unknown -a -f 15_unknown ]; then - paste 15_unknown 14_unknown | sed 's/[[:space:]]$//g' > 14_15_Email_Webadresse - rm 14_unknown 15_unknown + if [ -f 14_unknown ]; then + tr '\\' '/' < 14_unknown | iconv -f iso-8859-15 -t utf-8 > 15_Email else - tr -dC '\n' < 01_Flags > 14_15_Email_Webadresse + tr -dC '\n' < 01_Flags > 15_Email + fi + if [ -f 15_unknown ]; then + tr '\\' '/' < 15_unknown | iconv -f iso-8859-15 -t utf-8 > 14_Webadresse + else + tr -dC '\n' < 01_Flags > 14_Webadresse fi printf "Normalizing flags ... " @@ -278,9 +290,34 @@ handle_format_version_3() { printf "done.\n" printf "Normalizing zusaetze ... " - paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E -e 's/ +/ /g' -e 's/^ +//g' -e 's/ +$//g' > 04_Zusaetze + sed -E -e 's:^, +:u. :' 04_unknown > 04_Namenszusatz + sed -E -e 's:^, +:u. :' 05_unknown > 05_Adresszusatz + paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze + printf "done.\n" + + printf "Normalizing verweise ... " + sed -E -e 's:^\|::g;s:\|$::g;s:\|:, :g' 09_unknown | awk '{$1=$1};1' > 09_Verweise printf "done.\n" + # At least 2002_Q3 and 2003_Q1 are known to sport | -separated vname and nname fields + # those fields are redundant, as they are being made explicit in 09_-column for these entries + if grep -q '|' 03_Vorname; then + + printf "Treating vname fields with pipe separator ... (adds 3 minutes) ... " + # Identify entries with | in nname and move content of 09_Verweise to 04_Zusaetze + paste 03_Vorname 04_Zusaetze 09_Verweise | sed -E $'s:^([^|]*)\|.*\t(.*)\t(.*):\\1\t\\2 \\3\t:;' > 03_04_09_Temp + cut -f 1 03_04_09_Temp | awk '{$1=$1};1' > 03_Vorname + cut -f 2 03_04_09_Temp | awk '{$1=$1};1' > 04_Zusaetze + cut -f 3 03_04_09_Temp | awk '{$1=$1};1' > 09_Verweise + rm 03_04_09_Temp + + # Delete redundant nachnamen values + cut -d '|' -f 1 02_Nachname > 02_Nachname.new + mv 02_Nachname.new 02_Nachname + + printf "done.\n" + fi + # If street names come in an extra file, extract # street names first if [ -f "${streets}" ]; then @@ -293,30 +330,27 @@ handle_format_version_3() { # instead of street names if [ -f 99_Strassenname ]; then mv 07_unknown 07_Strassenindex - printf "Looking up street names from indexes ... " # fix up known broken Strassennamen file [ `stat -f %z ${streets}` -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname - tidy_streetnames 99_Strassenname - + printf "Looking up street names from indexes ... " cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse printf "done.\n" else mv 07_unknown 07_Strasse - tidy_streetnames 07_Strasse fi if [ -f "${karto}" ]; then do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw printf "Looking up geo coordinates for each phonebook entry ... " - tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr + tr '\0;' '\n\t' < 90_Geokoordinaten_hnr_raw | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr rm 90_Geokoordinaten_hnr_raw paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten printf "done.\n" else - sed $'s:.*:\t' 01_Flags > 16_Koordinaten + sed $'s:.*:\t:' 01_Flags > 16_Koordinaten fi if [ -f "${braid}" ]; then @@ -326,6 +360,10 @@ handle_format_version_3() { map_branches_v3 97_Branchenname < 09_Branchenindex > 09_Branchen printf "done.\n" fi + + tidy_columns + + rm ??_unknown } handle_format_version_4() { @@ -367,15 +405,14 @@ handle_format_version_4() { if grep -q ^40 column_0; then printf "Cleanung up inverted reverse search flags ... " awk '{ a=substr($0,1,1); printf "%x%x\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags - rm column_0 printf "done\n" else mv column_0 01_Flags fi + mv column_1 02_Nachname mv column_2 03_Vorname mv column_3 04_05_Namenszusatz_Addresszusatz - mv column_4 09_Verweise mv column_5 07_08_Strassenindex_Hausnummer mv column_6 12_Vorwahl mv column_7 10_Postleitzahl @@ -383,8 +420,6 @@ handle_format_version_4() { mv column_9 13_Rufnummer mv column_10 14_15_Email_Webadresse - tidy_streetnames 99_Strassenname - printf "Looking up street names from indexes ... " cut -f 1 07_08_Strassenindex_Hausnummer | ${EL} -0 99_Strassenname > 07_Strasse printf "done.\n" @@ -394,7 +429,19 @@ handle_format_version_4() { printf "done.\n" printf "Normalizing zusaetze ... " - tr '\t' ' ' < 04_05_Namenszusatz_Addresszusatz | sed -E s/' +'/' '/g > 04_Zusaetze + sed -E -e $'s:(^|\t),: u. :g' 04_05_Namenszusatz_Addresszusatz | awk '{$1=$1};1' > 04_Zusaetze + printf "done.\n" + + printf "Normalizing verweise ... " + sed -E -e $'s:^\|+::g;s:\|+$::g;s:\|:, :g' column_4 | awk '{$1=$1};1' > 09_Verweise + printf "done.\n" + + printf "Splitting webaddress ... " + cut -d $'\t' -f 1 14_15_Email_Webadresse | tr '\\' '/' | iconv -f iso-8859-15 -t utf-8 > 14_Webadresse + printf "done.\n" + + printf "Splitting email ... " + sed $'s:$:\t:' < 14_15_Email_Webadresse | cut -sd $'\t' -f 2 | tr '\\' '/' | iconv -f iso-8859-15 -t utf-8 > 15_Email printf "done.\n" if [ -f "$1/zip-streets-hn-geo.tl" ]; then @@ -414,7 +461,7 @@ handle_format_version_4() { paste 10_Postleitzahl 07_Strasse | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten printf "done.\n" else - sed $'s:.*:\t' 01_Flags > 16_Koordinaten + sed $'s:.*:\t:' 01_Flags > 16_Koordinaten fi rm file_* @@ -429,19 +476,65 @@ handle_format_version_4() { map_branches_v4 97_Branchenname < 09_Verweise > 09_Branchen printf "done.\n" fi + + tidy_columns + rm column_* } -tidy_streetnames () { - streets="$1" +tidy_columns () { - printf "Tyding up streetnames ... " + printf "Removing backslashes from Nachnamen ... " + sed -E -e 's:\\::g' 02_Nachname | awk '{$1=$1};1' | iconv -f iso-8859-15 -t utf-8 > 02_Nachname.new + mv 02_Nachname.new 02_Nachname + printf "done.\n" + + printf "Unicoding Vornamen ... " + iconv -f iso-8859-15 -t utf-8 03_Vorname > 03_Vorname.new + mv 03_Vorname.new 03_Vorname + printf "done.\n" + printf "Unicoding Zusaetze ... " + iconv -f iso-8859-15 -t utf-8 04_Zusaetze > 04_Zusaetze.new + mv 04_Zusaetze.new 04_Zusaetze + printf "done.\n" + + printf "Tidying up streetnames ... " # Replace any dots at end of line by a single one # finish any str abbreviation without a period with a period - sed -E -i.bak 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/ -$/ Str./;s/-$/str./' ${streets} - rm "${streets}".bak + sed -E 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/(.*)-(.*) -/\1-\2-Str./;s/ -$/ Str./;s/-$/str./' 07_Strasse | iconv -f iso-8859-15 -t utf-8 > 07_Strasse.new + mv 07_Strasse.new 07_Strasse + printf "done.\n" + printf "Normalizing house numbers ... " + sed -E -e 's:^([[:digit:]]+) *([A-Za-z])$:\1 \2:' -e 's: a$: A:;s: b$: B:;s: c$: C:;s: d$: D:;s: e$: E:;s: f$: F:;s: g$: G:;s: h$: H:;s: i$: I:;s: j$: J:;s: k$: K:;s: l$: L:;s: m$: M:;s: n$: N:;s: o$: O:;' 08_Hausnummer | iconv -f iso-8859-15 -t utf-8 > 08_Hausnummer.new + mv 08_Hausnummer.new 08_Hausnummer printf "done.\n" + + printf "Unicoding Verweise ... " + iconv -f iso-8859-15 -t utf-8 09_Verweise > 09_Verweise.new + mv 09_Verweise.new 09_Verweise + printf "done.\n" + + printf "Unicoding Postleitzahl ... " + iconv -f iso-8859-15 -t utf-8 10_Postleitzahl > 10_Postleitzahl.new + mv 10_Postleitzahl.new 10_Postleitzahl + printf "done.\n" + + printf "Removing trailing * from Ort ... " + sed -E -e 's:\*$::' 11_Ort | iconv -f iso-8859-15 -t utf-8 > 11_Ort.new + mv 11_Ort.new 11_Ort + printf "done.\n" + + printf "Unicoding Vorwahl ... " + iconv -f iso-8859-15 -t utf-8 12_Vorwahl > 12_Vorwahl.new + mv 12_Vorwahl.new 12_Vorwahl + printf "done.\n" + + printf "Unicoding Rufnummer ... " + iconv -f iso-8859-15 -t utf-8 13_Rufnummer > 13_Rufnummer.new + mv 13_Rufnummer.new 13_Rufnummer + printf "done.\n" + } # JOT diff --git a/src/export/convert_coords.c b/src/export/convert_coords.c index abbbc22..64d7cbe 100644 --- a/src/export/convert_coords.c +++ b/src/export/convert_coords.c @@ -31,8 +31,10 @@ int main( ) double lon = l0+th/n; double lat = 2.0*atan(pow(F/r,1.0/n))-0.5*M_PI; - - printf("%lf\t%lf\n", lat*180.0/M_PI, lon*180.0/M_PI); + if (x > 0 && y > 0) + printf("%lf\t%lf\n", lat*180.0/M_PI, lon*180.0/M_PI); + else + printf("\t\n"); } else printf("\t\n"); } -- cgit v1.2.3