src/makecolumns.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262

#!/bin/sh
#
# Splits the data files of a Telefonbuch directory into one text file per
# column, written to ../work_<name> relative to the current directory.
#
# Usage: makecolumns.sh [phonebookdirectory]

# Force the C locale for every tool started from here.
# NOTE(review): presumably so that tr/cut/sed treat the phonebook data as raw
# bytes instead of locale-dependent multibyte text - confirm.
export LANG=C
export LC_CTYPE=C
export LC_ALL=C

# Make the helper binaries in ../bin (relative to the invocation directory)
# reachable. The value is quoted so that a directory name containing
# whitespace is not word-split by shells that split export arguments.
export PATH="${PATH}:$(pwd)/../bin/"

# Entry point.
#   $1  phonebook directory to convert
# Locates the el helper, builds the binaries, recreates the working directory
# ../work_<name> and dispatches to the handler matching the directory layout.
# Exits with status 1 on missing helper, wrong argument count or when the
# working directory cannot be entered; returns 1 for an unrecognized folder.
main() {
    # A copy of el next to this script takes precedence over a system-wide one.
    [ -f /usr/local/bin/el ] && EL=/usr/local/bin/el
    script_dir=$(dirname "$0")
    [ -f "${script_dir}/../bin/el" ] && EL="${script_dir}/../bin/el"

    if [ -z "${EL}" ]; then
      echo "el not found. Get it at 'cvs -d :pserver:anoncvs@cvs.erdgeist.org:/home/cvsroot co el'" >&2
      exit 1
    fi

    if [ $# -ne 1 ]; then
      echo "Syntax: $0 [phonebookdirectory]" >&2
      exit 1
    fi

    # Compile all the binaries
    make all

    # Compute the working directory once instead of re-deriving it for every
    # command. NOTE(review): the white_ prefix is stripped from the argument
    # as given (before basename), so it only applies when $1 itself starts
    # with white_ - confirm that this is intended.
    work_dir=../work_$(basename "${1#white_}")

    printf "Cleaning up old working directory ... "
    rm -rf "${work_dir}"
    printf "done.\n"
    mkdir -p "${work_dir}"
    cd "${work_dir}" || exit 1

    main_status=0
    if [ -f "$1/phonebook.db" ]; then
        handle_new_format "$1"
    elif [ -f "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] ]; then
        # Only the directory part is quoted: the bracket patterns must stay
        # unquoted so that the shell matches dat/teiln.dat in any letter case.
        handle_old_format "$1"
    else
        echo "Not a recognized Telefonbuch folder" >&2
        main_status=1
    fi
    cd ..
    return "${main_status}"
}

# Print one unsigned 32 bit value of a file as a decimal number.
#   $1  file to read from
#   $2  index of the dword to read (byte offset is 4 * $2), defaults to 0
# The value is decoded by od(1) with -tu4. Prints 0 when od yields no value,
# e.g. because the file is missing or too short.
get_dword() {
    # -An drops the address column, leaving only the value; the command
    # substitution is expanded before set -- replaces the positional
    # parameters, so $1/$2 inside it still refer to the caller's arguments.
    set -- $(od -An -tu4 -N4 -j"$(( 4 * ${2:-0} ))" "$1")
    printf "%d\n" "${1:-0}"
}

# Extract the chunks of an old-format data file into the current directory
# and unpack every resulting *.lha archive, reporting progress on stdout.
#   $1  data file handed to extractblocks
#   $2  human readable label used in the progress messages
# Sets the global variables number_of_files, reported and processed.
do_decompress_old() {
    # The label is passed as an argument, never as part of the format string,
    # so a '%' or '\' in it cannot be misinterpreted by printf.
    printf "Extracting %s chunks ... " "${2}"
    extractblocks "${1}"
    printf "done.\n"

    printf "Decompressing %s chunks ... " "${2}"
    number_of_files=$(find . -name \*.lha | wc -l)
    reported=0; processed=0
    for archive in *.lha; do
        # Without any archive the pattern stays literal; skipping it avoids
        # running lha on a bogus name and dividing by a file count of zero.
        [ -f "${archive}" ] || continue
        lha x "${archive}" > /dev/null
        rm "${archive}"
        # Progress is reported in 5% steps, at most one step per archive.
        # Plain assignments replace ++, which POSIX arithmetic does not require.
        if [ $(( ( processed * 20 ) / number_of_files )) -gt "${reported}" ]; then
            reported=$(( reported + 1 ))
            printf "%d%% " $(( reported * 5 ))
        fi
        processed=$(( processed + 1 ))
    done
    # NOTE(review): reported counts 5% steps, so this only prints the final
    # 100% when less than 50% was reported - confirm whether 20 was intended.
    [ "${reported}" -lt 10 ] && printf "100%% "
    printf "done.\n"
}

# Decompress an old-format data file in a scratch directory and concatenate
# all extracted chunks into a single output file.
#   $1  data file to decompress
#   $2  human readable label used in the progress messages
#   $3  output file, created in the current directory
#   $4  optional: "convert_zeros" turns NUL bytes into newlines and
#       newlines into tabs while combining
# Returns 1, leaving the current directory untouched, when the scratch
# directory cannot be created or entered.
do_processfile_old() {
    working_on=$(basename "${1}")

    # Bail out before decompressing: otherwise the chunks would land in the
    # current directory and the cd .. below would leave the work directory.
    if ! mkdir "${working_on}"; then
        echo "Cannot create scratch directory ${working_on}" >&2
        return 1
    fi
    if ! cd "${working_on}"; then
        rmdir "${working_on}"
        return 1
    fi
    do_decompress_old "${1}" "${2}"
    cd ..

    printf "Combining %s into single file ... " "${2}"
    if [ "${4}" = "convert_zeros" ]; then
        cat "${working_on}"/* | tr '\n\0' '\t\n' > "${3}"
    else
        cat "${working_on}"/* > "${3}"
    fi
    printf "done.\n"

    rm -rf "${working_on}"
}

# Convert a pre-2004 Telefonbuch directory into column files (01_Flags,
# 02_Nachname, ...) in the current directory.
#   $1  phonebook directory containing dat/teiln.dat (any letter case)
# Exits with status 1 when the decompressed chunks do not have the expected
# layout.
# NOTE(review): find -E / -depth 1, stat -f and jot look like BSD userland
# spellings - confirm the intended target platform before porting.
handle_old_format() {
    echo "Working on $1. Detected pre-2004 Telefonbuch version."
    # Extract teiln.dat; only the directory part is quoted so that the
    # bracket patterns still match the file in any letter case.
    do_decompress_old "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] "teiln.dat"

    # See how long each filename is
    filename_len=$(( `ls | head -n 1 | wc -c` - 1 ))

    # Get total amount of files, for reporting progress
    number_of_files=`find -E . -depth 1 -regex '^\./[0123456789]+' | wc -l`

    # from 2000F on file 0+3*n is table, so make it default
    table_file=`printf %0${filename_len}d 0`
    vname_file=`printf %0${filename_len}d 2`

    # if supposed vname file is larger than table file,
    # we're having a pre-2000F layout, so switch accordingly
    if [ `stat -f %z ${table_file}` -lt `stat -f %z ${vname_file}` ]; then
        table_file=`printf %0${filename_len}d 2`
        nname_file=`printf %0${filename_len}d 0`
        vname_file=`printf %0${filename_len}d 1`
    else
        nname_file=`printf %0${filename_len}d 1`
    fi

    # Table file has a table header with identical count
    # to nname file's header. Verify this
    if [ `get_dword ${nname_file}` -ne `get_dword ${table_file}` ]; then
        echo "Unknown layout." >&2
        # A bare exit would report the status of the echo above, i.e. success.
        exit 1
    fi

    # Now loop over all files and dump them
    printf "Splitting decompressed chunks into their columns ... "
    reported=0
    while [ -f ${nname_file} ]; do
        # Get number of entries in this round
        count=`get_dword ${nname_file}`

        # Get offset into first nname (tail -c + counts from 1, hence the + 1)
        nname_off=$(( `get_dword ${nname_file} 1` + 1 ))

        # Each nname record starts with one flag character followed by the
        # name itself; split them into two columns
        tail -c +${nname_off} ${nname_file} | tr '\n\0' '\t\n' | head -n ${count} | cut -c -1 >> 01_Flags
        tail -c +${nname_off} ${nname_file} | tr '\n\0' '\t\n' | head -n ${count} | cut -c 2- >> 02_Nachname

        # Extract the vnames
        tr '\n\0' '\t\n' < ${vname_file} | head -n ${count} >> 03_Vorname

        # Offset into first table entry tells us how many
        # fields are in table file
        table_entries=$(( `get_dword ${table_file} 1` / 4 - 1 ))

        # Now iterate over all entries in the table file; field n of the
        # table ends up in column file number n + 3
        for table_index in `jot ${table_entries}`; do
            table_off=`get_dword ${table_file} ${table_index}`
            tail -c +$(( table_off + 1 )) ${table_file} | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( table_index + 3 ))`
        done

        # Advance the filenames. Note, that we need bc because
        # builtin arithmetic treats numbers with leading zeros as octals
        nname_file=`printf "%s + 3\n" ${nname_file} | bc`
        nname_file=`printf %0${filename_len}d ${nname_file}`
        vname_file=`printf "%s + 3\n" ${vname_file} | bc`
        vname_file=`printf %0${filename_len}d ${vname_file}`
        table_file=`printf "%s + 3\n" ${table_file} | bc`
        # table_file is still unpadded here, so it is safe in arithmetic.
        # Progress advances in 5% steps; a plain assignment replaces ++,
        # which POSIX arithmetic does not require.
        if [ $(( ( table_file * 20 ) / number_of_files )) -gt "${reported}" ]; then
            reported=$(( reported + 1 ))
            printf "%d%% " $(( reported * 5 ))
        fi
        table_file=`printf %0${filename_len}d ${table_file}`
    done
    printf "done.\n"

    # wipe all temporary extracted files
    printf "Cleaning up decompressed chunks ... "
    find -E . -depth 1 -regex '^\./[0123456789]+' -delete
    printf "done.\n"

    # rename our columns extracted from the table file
    mv 04_unknown 04_Namenszusatz
    mv 05_unknown 05_Adresszusatz
    mv 06_unknown 06_Ortszusatz
    mv 08_unknown 08_Hausnummer
    mv 09_unknown 09_Verweise
    mv 10_unknown 10_Postleitzahl
    mv 11_unknown 11_Ort
    mv 12_unknown 12_Vorwahl
    mv 13_unknown 13_Rufnummer
    [ -f 14_unknown ] && mv 14_unknown 14_Email
    [ -f 15_unknown ] && mv 15_unknown 15_Webadresse

    # If street names come in an extra file, extract street names first.
    # The pattern is expanded in a for loop because a plain assignment would
    # store the unexpanded pattern, which then reaches the extraction tools
    # literally once it is passed on in quotes.
    streets=
    for candidate in "$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]; do
        if [ -f "${candidate}" ]; then
            streets=${candidate}
            break
        fi
    done
    if [ -n "${streets}" ]; then
        do_processfile_old "${streets}" "street name" 99_Strassenname convert_zeros
    fi

    # extract street names if 07_unknown contains street indexes
    # instead of street names
    if [ -f 99_Strassenname ]; then
        mv 07_unknown 07_Strassenindex
        printf "Looking up street names from indexes ... "
        cut -d ';' -f 1 07_Strassenindex | "${EL}" -0x 99_Strassenname > 07_Strasse
        printf "done.\n"
    else
        mv 07_unknown 07_Strasse
    fi

    # Optional geo coordinate file, resolved the same way as the street file
    karto=
    for candidate in "$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]; do
        if [ -f "${candidate}" ]; then
            karto=${candidate}
            break
        fi
    done
    if [ -n "${karto}" ]; then
        do_processfile_old "${karto}" "geo coordinates" 90_Geokoordinaten_hnr
    fi
}

# Convert a post-2003 Telefonbuch directory into column files (01_Flags,
# 02_Nachname, ...) in the current directory.
#   $1  phonebook directory containing phonebook.db and streets.tl
# The decompress helper leaves its output as file_XXXXX chunks (hexadecimal
# suffix) in the current directory; chunk number row * 11 + column holds one
# column of one row block.
handle_new_format() {
    echo "Working on $1. Detected post-2003 Telefonbuch version."

    # A literal tab, produced portably instead of relying on $'\t'
    tab=$(printf '\t')

    printf "Extracting street names ... "
    decompress "$1/streets.tl"

    cat file_* | tr '\n\0' '\t\n' > 99_Strassenname
    rm file_*
    printf "done.\n"

    printf "Extracting phonebook.db ... "
    decompress "$1/phonebook.db" | grep -v appropriate

    rows=`find . -name file_\* | wc -l`
    rows=$(( rows / 11 ))
    printf "done.\n"

    # Do enumerations with builtin shell tools. Unfortunately neither
    # jot nor seq are standards. Counters are advanced with plain
    # assignments because POSIX arithmetic does not require ++.
    printf "Splitting decompressed chunks into their columns (11 total) ... 0, "
    # Column 0 is dumped as one hex byte per line instead of text
    f=0
    while [ "$f" -lt "$rows" ]; do
        printf "file_%05X " $(( f * 11 ))
        f=$(( f + 1 ))
    done | xargs cat | xxd -ps -c1 > column_0

    for column in 1 2 3 4 5 6 7 8 9 10; do
      printf "%d, " $column
      f=0
      while [ "$f" -lt "$rows" ]; do
          printf "file_%05X " $(( column + f * 11 ))
          f=$(( f + 1 ))
      done | xargs cat | tr '\n\0' '\t\n' > column_${column}
    done
    printf "done.\n"

    printf "Cleaning up decompressed chunks ... "
    find . -name file_\* -delete
    printf "done.\n"

    mv column_0 01_Flags
    mv column_1 02_Nachname
    mv column_2 03_Vorname
    mv column_3 04_05_Namenszusatz_Addresszusatz
    mv column_4 09_Verweise
    mv column_5 07_08_Strassenindex_Hausnummer
    mv column_6 12_Vorwahl
    mv column_7 10_Postleitzahl
    mv column_8 11_Ort
    mv column_9 13_Rufnummer
    mv column_10 14_15_Email_Webadresse

    printf "Looking up street names from indexes ... "
    cut -f 1 07_08_Strassenindex_Hausnummer | "${EL}" -0 99_Strassenname > 07_Strasse
    printf "done.\n"

    printf "Splitting house numbers ... "
    # Append a tab to every line first: cut prints lines without any
    # delimiter unchanged, which would yield the index instead of an
    # empty house number
    sed "s:\$:${tab}:" < 07_08_Strassenindex_Hausnummer | cut -f 2 > 08_Hausnummer
    printf "done.\n"

    if [ -f "$1/zip-streets-hn-geo.tl" ]; then
      printf "Extracting geo coordinates (precision: house number) ... "
      decompress "$1/zip-streets-hn-geo.tl"
      cat file_* > 90_Geokoordinaten_hnr
      printf "done.\n"
      printf "Looking up geo coordinates for each phonebook entry ... "
      lam 10_Postleitzahl -s "${tab}" 07_Strasse -s "${tab}" 08_Hausnummer | mapcoords 90_Geokoordinaten_hnr > 16_Koordinaten
      printf "done.\n"
    elif [ -f "$1/zip-streets-geo.tl" ]; then
      printf "Extracting geo coordinates (precision: street) ... "
      decompress "$1/zip-streets-geo.tl"
      cat file_*  > 91_Geokoordinaten_str
      printf "done.\n"
      printf "Looking up geo coordinates for each phonebook entry ... "
      lam 10_Postleitzahl -s "${tab}" 07_Strasse | mapcoords 91_Geokoordinaten_str > 16_Koordinaten
      printf "done.\n"
    fi
    # -f: without a geo table no chunks are left at this point, and a plain
    # rm would fail on the unmatched pattern
    rm -f file_*
}

# Run main only here, after every function above has been defined, and
# forward all command line arguments to it unchanged.
main "$@"