#!/bin/sh
export LANG=C
export LC_CTYPE=C
export LC_ALL=C
export PATH=${PATH}:`pwd`/bin/
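# main <phonebook-cd-directory>
# Detects which Telefonbuch CD format the given directory contains and
# converts it into one file per column under work/<basename>/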
main() {
[ -f /usr/local/bin/el ] && EL=/usr/local/bin/el
[ -f `dirname $0`/../bin/el ] && EL=`dirname $0`/../bin/el
if [ -z "${EL}" ]; then
echo "el not found. Get it at 'cvs -d :pserver:anoncvs@cvs.erdgeist.org:/home/cvsroot co el'"
exit 1
fi
if [ $# -ne 1 ]; then
echo "Syntax: $0 [phonebookdirectory]"
exit 1
fi
# Compile all the binaries
make all
printf "Cleaning up old working directory ... "
rm -rf work/`basename "${1#white_}"`
printf "done.\n"
mkdir -p work/`basename "${1#white_}"`
cd work/`basename "${1#white_}"` || exit 1
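# Detect the CD format by its marker file and dispatch to the matching handler:
# phonebook.db -> post-2003, DAT/TEILN.DAT -> pre-2004, atb?dd00 -> pre-02/1996,
# dpr00000.005 -> the 1992 edition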
if [ -f "$1/phonebook.db" ]; then
handle_format_version_4 "${1}"
elif [ -f ${1}/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] ]; then
handle_format_version_3 "${1}"
elif [ -n "`find "${1}" -iname atb?dd00 -ls -quit`" ]; then
handle_format_version_2 "${1}"
elif [ -n "`find "${1}" -name dpr00000.005 -ls -quit`" ]; then
handle_format_version_1 "${1}"
else
echo "Not a recognized Telefonbuch folder"
fi
cd ../..
}
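# do_decompress_version_3 <datfile> <description>
# Extracts the LHA chunks from <datfile> into the current directory,
# decompresses them and reports progress in 5% steps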
do_decompress_version_3() {
printf "Extracting $2 chunks ... "
extract_version_3 "${1}"
printf "done.\n"
printf "Decompressing $2 chunks ... "
numfiles=`find . -name \*.lha | wc -l`
reported=0; processed=0
for archive in *.lha; do
lha x ${archive} > /dev/null
rm ${archive}
[ 1 -eq $(( ( ( (processed+=1) * 20 ) / numfiles ) > reported )) ] && printf "%d%% " $(( (reported+=1) * 5 ))
done
[ $reported -lt 20 ] && printf "100%% "
printf "done.\n"
}
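# do_processfile_version_3 <datfile> <description> <outfile> [convert_zeros]
# Decompresses <datfile> in a scratch directory and concatenates the chunks
# into <outfile>; with convert_zeros the NUL-terminated records are rewritten
# one per line (embedded newlines become tabs)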
do_processfile_version_3() {
working_on=`basename ${1}`
mkdir $working_on && cd ${working_on}
do_decompress_version_3 "${1}" "${2}"
cd ..
printf "Combining $2 into single file ... "
if [ "${4}" = "convert_zeros" ]; then
cat ${working_on}/* | tr '\n\0' '\t\n' > $3
else
cat ${working_on}/* > $3
fi
printf "done.\n"
rm -rf ${working_on}
}
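# size <chunk number>
# Prints the size in bytes of the chunk file whose zero-padded name is built
# from the given number. Note: BSD stat syntax; GNU coreutils would need stat -c %s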
size() {
stat -f %z `printf %0${filename_len}d $1`
}
get_dword() {
# $1 - chunk file number; prints the first 32-bit word of that chunk as an unsigned decimal
hexdump -n 4 -v -e '" " 1/4 "%u"' `printf %0${filename_len}d ${1}`
}
handle_format_version_1() {
echo "Working on $1. Detected 1992 Telefonbuch version."
# Extract all dpr database files
printf "Extracting dpr databases ... "
find "$1" -name dpr\*.001 | extract_version_1
printf "done.\n"
# rename our extracted columns
mv 01_unknown 01_Flags
mv 02_unknown 02_Nachname
mv 03_unknown 03_Vorname
mv 04_unknown 05_Adresszusatz
mv 05_unknown 06_Ortszusatz
mv 06_unknown 10_Zustellamt_PLZOst
mv 07_unknown 07_Strasse
mv 08_unknown 08_Hausnummer
mv 09_unknown 04_Namenszusatz
mv 10_unknown 09_Fax_Verweise
mv 11_unknown 12_Vorwahl
mv 12_unknown 13_Rufnummer
mv 13_unknown 11_Ort
mv 14_unknown 10_Postleitzahl
printf "Normalizing zusaetze ... "
paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze
printf "done.\n"
tidy_streetnames 07_Strasse
}
handle_format_version_2() {
echo "Working on $1. Detected pre 02/1996 Telefonbuch version."
# Extract the PKWARE-compressed databases from all CDs
printf "Extracting 3 pkware databases ...\n"
cd=1
for database in `find "$1" -iname atb?dd00`; do
dir=`dirname ${database}`
base=`basename ${database}`
printf " %d/3 in %4s. Decompressing ... " ${cd} "$( basename ${dir} )"
extract_version_2 "${database}" > ${base}.dump
printf ", extracting ... "
indexfile=$( find ${dir} -iname atb?di00 )
split_version_2 "${base}.dump" "${indexfile}"
printf ", cleaning up ... "
rm "${base}.dump"
printf "done.\n"
cd=$((cd+1))
done
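# rename our extracted columns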
mv 01_unknown 01_Flags
mv 16_unknown 02_Nachname
mv 07_unknown 03_Vorname
mv 14_unknown 04_Namenszusatz
mv 11_unknown 05_Adresszusatz
mv 12_unknown 06_Ortszusatz
mv 08_unknown 07_Strasse
mv 10_unknown 08_Hausnummer
mv 13_unknown 09_Fax_Verweise
mv 02_unknown 10_Postleitzahl
mv 15_unknown 11_Ort
mv 09_unknown 11_Ort_Gemeinde
mv 05_unknown 12_Vorwahl
mv 06_unknown 13_Rufnummer
# remove columns that exist for searching only
rm 03_unknown 04_unknown
printf "Normalizing zusaetze ... "
paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze
printf "done.\n"
tidy_streetnames 07_Strasse
}
handle_format_version_3() {
echo "Working on $1. Detected pre-2004 Telefonbuch version."
# Extract teiln.dat
do_decompress_version_3 $1/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] "teiln.dat"
# See how long each filename is
export filename_len=$(( `ls | head -n 1 | wc -c` - 1 ))
# Get the total number of files, for reporting progress
number_of_files=`find -E . -depth 1 -regex '^\./[0123456789]+' | wc -l`
# From 2000F on, file 0+3*n is the table file, so make that the default
table_file=0; vname_file=2
# If the supposed vname file is larger than the table file,
# we have a pre-2000F layout, so switch accordingly
if [ `size ${table_file}` -lt `size ${vname_file}` ]; then
table_file=2; nname_file=0; vname_file=1
else
nname_file=1
fi
# The table file's header carries the same record count as the
# nname file's header. Verify this
if [ `get_dword ${nname_file}` -ne `get_dword ${table_file}` ]; then
echo "Unknown layout."
exit
fi
# Now loop over all files and dump them
printf "Splitting decompressed nname chunks into their columns ... "
JOT "%0${filename_len}d" ${nname_file} $(( number_of_files - 1 )) 3 | split_version_3 1 1
# set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}`
# tail -c +$(( $2 + 1 )) ${file}
# done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname
cut -c 1 < 01_unknown > 01_Flags
cut -c 2- < 01_unknown > 02_Nachname
rm 01_unknown
printf "done.\n"
printf "Splitting decompress vname chunks into their columns ... "
JOT "%0${filename_len}d" ${vname_file} $(( number_of_files - 1 )) 3 | xargs -n 128 cat | tr '\n\0' '\t\n' | tr -d '\377' > 03_Vorname
printf "done.\n"
printf "Splitting decompress table file chunks into their columns ... "
JOT "%0${filename_len}d" ${table_file} $(( number_of_files - 1 )) 3 | split_version_3 4 0
# for file in `jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3`; do
# # Offset into first table entry tells us how many
# # fields are in table file
# set -- `hexdump -n 64 -v -e '" " 1/4 "%u"' ${file}`
# count=$1; table_entries=$(( $2 / 4 - 1 )); shift
#
# # Now iterate over all entries in the table file
# for idx in `jot ${table_entries}`; do
# tail -c +$(( $1 + 1 )) ${file} | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( idx + 3 ))`
# shift
# done
# done
printf "done.\n"
# wipe all temporary extracted files
printf "Cleaning up decompressed chunks ... "
find -E . -depth 1 -regex '^\./[0123456789]+' -delete
printf "done.\n"
# rename our columns extracted from the table file
mv 04_unknown 04_Namenszusatz
mv 05_unknown 05_Adresszusatz
mv 06_unknown 06_Ortszusatz
mv 08_unknown 08_Hausnummer
mv 09_unknown 09_Verweise
mv 10_unknown 10_Postleitzahl
mv 11_unknown 11_Ort
mv 12_unknown 12_Vorwahl
mv 13_unknown 13_Rufnummer
[ -f 14_unknown ] && mv 14_unknown 14_Email
[ -f 15_unknown ] && mv 15_unknown 15_Webadresse
printf "Normalizing flags ... "
sed -i.bak -e s:^1$:00:g -e s:^3$:01:g -e s:^2$:02:g 01_Flags
rm 01_Flags.bak
printf "done.\n"
printf "Normalizing zusaetze ... "
paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E -e 's/ +/ /g' -e 's/^ +//g' -e 's/ +$//g' > 04_Zusaetze
printf "done.\n"
# If street names come in an extra file, extract
# street names first
streets=$1/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]
[ -f ${streets} ] && do_processfile_version_3 ${streets} "street name" 99_Strassenname convert_zeros
# look up street names if 07_unknown contains street indexes
# instead of street names
if [ -f 99_Strassenname ]; then
mv 07_unknown 07_Strassenindex
printf "Looking up street names from indexes ... "
# fix up known broken Strassennamen file
[ `stat -f %z ${streets}` -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname
tidy_streetnames 99_Strassenname
cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse
printf "done.\n"
else
mv 07_unknown 07_Strasse
tidy_streetnames 07_Strasse
fi
karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]
if [ -f ${karto} ]; then
do_processfile_version_3 ${karto} "geo coordinates" 90_Geokoordinaten_hnr_raw
printf "Looking up geo coordinates for each phonebook entry ... "
tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr
rm 90_Geokoordinaten_hnr_raw
paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
printf "done.\n"
fi
}
handle_format_version_4() {
if [ -f $1/branchcodes.tl ]; then
is_yp=true
echo "Working on $1. Detected post-2003 Yellow Pages version."
else
unset is_yp
echo "Working on $1. Detected post-2003 Telefonbuch version."
fi
printf "Extracting street names ... "
extract_version_4 $1/streets.tl
cat file_* | tr '\n\0' '\t\n' > 99_Strassenname
rm file_*
printf "done.\n"
printf "Extracting phonebook.db ... "
extract_version_4 $1/phonebook.db
rows=`find . -name file_\* | wc -l`
printf "done.\n"
printf "Splitting decompressed chunks into their columns (11 total) ... 1, "
JOT "file_%05X" 0 $(( rows - 1 )) 11 | xargs -n 128 cat | xxd -ps -c1 > column_0
for col in 1 2 3 4 5 6 7 8 9 10; do
printf "%d, " $(( col + 1 ))
JOT "file_%05X" ${col} $(( rows - 1 )) 11 | xargs -n 128 cat | tr '\n\0' '\t\n' > column_${col}
done
printf "done.\n"
printf "Cleaning up decompressed chunks ... "
find . -name file_\* -delete
printf "done.\n"
# the 'did not object to inverse search' flag is stored inverted and needs to be flipped
if grep -q ^40 column_0; then
printf "Cleanung up inverted reverse search flags ... "
awk '{ a=substr($0,1,1); printf "%x%x\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags
rm column_0
printf "done\n"
else
mv column_0 01_Flags
fi
mv column_1 02_Nachname
mv column_2 03_Vorname
mv column_3 04_05_Namenszusatz_Addresszusatz
mv column_4 09_Verweise
mv column_5 07_08_Strassenindex_Hausnummer
mv column_6 12_Vorwahl
mv column_7 10_Postleitzahl
mv column_8 11_Ort
mv column_9 13_Rufnummer
mv column_10 14_15_Email_Webadresse
tidy_streetnames 99_Strassenname
printf "Looking up street names from indexes ... "
cut -f 1 07_08_Strassenindex_Hausnummer | ${EL} -0 99_Strassenname > 07_Strasse
printf "done.\n"
printf "Splitting house numbers ... "
sed -E $'s:$:\t:' < 07_08_Strassenindex_Hausnummer | cut -f 2 > 08_Hausnummer
printf "done.\n"
printf "Normalizing zusaetze ... "
tr '\t' ' ' < 04_05_Namenszusatz_Addresszusatz | sed -E s/' +'/' '/g > 04_Zusaetze
printf "done.\n"
if [ -f $1/zip-streets-hn-geo.tl ]; then
printf "Extracting geo coordinates (precision: house number) ... "
extract_version_4 $1/zip-streets-hn-geo.tl
cat file_* > 90_Geokoordinaten_hnr
printf "done.\n"
printf "Looking up geo coordinates for each phonebook entry ... "
paste 10_Postleitzahl 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
printf "done.\n"
elif [ -f $1/zip-streets-geo.tl ]; then
printf "Extracting geo coordinates (precision: street) ... "
extract_version_4 $1/zip-streets-geo.tl
cat file_* > 91_Geokoordinaten_str
printf "done.\n"
printf "Looking up geo coordinates for each phonebook entry ... "
paste 10_Postleitzahl 07_Strasse | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten
printf "done.\n"
fi
rm file_*
if [ "${is_yp}" ]; then
printf "Extracting branch names ... "
extract_version_4 $1/branchcodes.tl
cat file_* | tr '\n\0' '\t\n' > 97_Branchenname
rm file_*
printf "done.\n"
printf "Generating branch name index ... "
mkdir branchcodes/
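# one file per branch code, named after the index and containing the branch name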
while read index name; do
printf %s "$name" > branchcodes/${index}
done < 97_Branchenname
printf "done.\n"
printf "Looking up branch names from codes ... "
map_branches 97_Branchenname < 09_Verweise > 09_Branchen
printf "done.\n"
rm -r branchcodes
fi
}
tidy_streetnames () {
streets=$1
# Collapse any run of dots into a single dot and terminate a trailing
# 'Str'/'str' abbreviation (optionally followed by a space or colon) with a dot
sed -E -i.bak 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./' ${streets}
rm "${streets}".bak
}
# JOT <format> <begin> <end> <step>
JOT () {
case `uname -s` in
*BSD|Darwin)
jot -w "$1" - "$2" "$3" "$4"
;;
*)
for x in `seq "$2" "$4" "$3"`; do printf "${1}\n" "$x"; done
;;
esac
}
# All functions are defined, now hand control to main()
main "$@"