diff options
Diffstat (limited to 'makecolumns.sh')
-rwxr-xr-x | makecolumns.sh | 286 |
1 files changed, 286 insertions, 0 deletions
diff --git a/makecolumns.sh b/makecolumns.sh new file mode 100755 index 0000000..0f9c5ba --- /dev/null +++ b/makecolumns.sh | |||
@@ -0,0 +1,286 @@ | |||
#!/bin/sh
#
# makecolumns.sh - split a Telefonbuch phonebook directory into one file
# per column inside ../work_<name>.
#
# Usage: makecolumns.sh <phonebookdirectory>

# Force the plain C locale for every tool started from here. The script
# pipes binary data (NUL bytes, 0xff) through tr and cut.
export LANG=C
export LC_CTYPE=C
export LC_ALL=C

# Make helper binaries in ../bin (relative to the invocation directory)
# reachable. Assignment and export are split and the value is quoted so a
# working directory containing blanks cannot be word-split.
PATH="${PATH}:$(pwd)/../bin/"
export PATH
7 | |||
# Entry point: check prerequisites and arguments, (re)create the working
# directory ../work_<name> and dispatch to the handler matching the
# detected phonebook format.
#
# Arguments:
#   $1 - phonebook directory; absolute, or relative to the current directory
# Globals:
#   EL - set to the location of the 'el' binary, read by the handlers
# Exits 1 if 'el' is missing, the argument count is wrong or the working
# directory cannot be entered. Returns 1 if the folder is not recognised.
main() {
  # Resolve the script directory to an absolute path: we change directory
  # below, which would invalidate a relative location of 'el'.
  script_dir=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)

  [ -f /usr/local/bin/el ] && EL=/usr/local/bin/el
  [ -f "${script_dir}/../bin/el" ] && EL="${script_dir}/../bin/el"

  if [ -z "${EL}" ]; then
    echo "el not found. Get it at 'cvs -d :pserver:anoncvs@cvs.erdgeist.org:/home/cvsroot co el'"
    exit 1
  fi

  if [ $# -ne 1 ]; then
    echo "Syntax: $0 [phonebookdirectory]"
    exit 1
  fi

  # Compile all the binaries
  make binaries

  # The working directory name is derived from the argument exactly as
  # given; the absolute form is only used to reach the source data after
  # we have changed directory.
  work_dir=../work_$(basename "${1#white_}")
  case ${1} in
    /*) source_dir=${1} ;;
    *) source_dir=$(pwd)/${1} ;;
  esac

  printf "Cleaning up old working directory ... "
  rm -rf "${work_dir}"
  printf "done.\n"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit 1

  # Look for dat/teiln.dat in any capitalisation. The glob has to stay
  # outside the quotes, otherwise it is tested as a literal file name.
  teiln=
  for candidate in "${source_dir}"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt]; do
    if [ -f "${candidate}" ]; then
      teiln=${candidate}
      break
    fi
  done

  status=0
  if [ -f "${source_dir}/phonebook.db" ]; then
    handle_format_version_3 "${source_dir}"
  elif [ -n "${teiln}" ]; then
    handle_format_version_2 "${source_dir}"
  elif [ -n "$(find "${source_dir}" -name dpr00000.005 -ls -quit)" ]; then
    handle_format_version_1 "${source_dir}"
  else
    echo "Not a recognized Telefonbuch folder"
    status=1
  fi
  cd ..
  return "${status}"
}
42 | |||
# Extract the .lha chunks from a version 2 data file into the current
# directory, unpack each of them with lha and delete the archive,
# reporting progress in 5% steps.
#
# Arguments:
#   $1 - data file handed to extract_version_2
#   $2 - human readable label used in the progress messages
do_decompress_version_2() {
  printf 'Extracting %s chunks ... ' "${2}"
  extract_version_2 "${1}"
  printf 'done.\n'

  printf 'Decompressing %s chunks ... ' "${2}"

  # Count the archives with the same glob the loop below iterates over.
  numfiles=0
  for archive in *.lha; do
    [ -f "${archive}" ] && numfiles=$(( numfiles + 1 ))
  done

  # Progress is tracked in twentieths (20 steps of 5% each).
  reported=0; processed=0
  for archive in *.lha; do
    # An unmatched glob stays literal; skip it instead of dividing by zero.
    [ -f "${archive}" ] || continue
    lha x "${archive}" > /dev/null
    rm -- "${archive}"
    processed=$(( processed + 1 ))
    step=$(( processed * 20 / numfiles ))
    if [ "${step}" -gt "${reported}" ]; then
      reported=${step}
      printf '%d%% ' $(( reported * 5 ))
    fi
  done
  # Only reached without a 100% report when there was nothing to unpack.
  if [ "${reported}" -lt 20 ]; then
    printf '100%% '
  fi
  printf 'done.\n'
}
59 | |||
# Decompress one version 2 data file in a scratch directory named after
# the file, then concatenate all resulting chunks into a single output
# file and remove the scratch directory.
#
# Arguments:
#   $1 - data file to process
#   $2 - human readable label used in the progress messages
#   $3 - output file
#   $4 - optional; "convert_zeros" maps newlines to tabs and NUL bytes to
#        newlines while concatenating
# Returns 1 without touching anything else if the scratch directory
# cannot be created or entered.
do_processfile_version_2() {
  working_on=$(basename "${1}")

  # Bail out early: continuing after a failed mkdir/cd would run the
  # "cd .." and "rm -rf" below from the wrong directory.
  if ! mkdir "${working_on}"; then
    return 1
  fi
  if ! cd "${working_on}"; then
    return 1
  fi
  do_decompress_version_2 "${1}" "${2}"
  cd ..

  printf 'Combining %s into single file ... ' "${2}"
  if [ "${4}" = "convert_zeros" ]; then
    cat "${working_on}"/* | tr '\n\0' '\t\n' > "${3}"
  else
    cat "${working_on}"/* > "${3}"
  fi
  printf 'done.\n'

  rm -rf -- "${working_on:?}"
}
76 | |||
# Print the size in bytes of a numbered chunk file.
#
# Arguments:
#   $1 - chunk number; zero-padded to ${filename_len} digits to form the
#        file name
# Globals:
#   filename_len - width of the chunk file names (read)
# Returns 1 if the file cannot be read.
size() {
  size_file=$(printf "%0${filename_len}d" "${1}")
  # wc -c is POSIX, unlike the BSD-only "stat -f %z".
  size_bytes=$(wc -c < "${size_file}") || return 1
  # Some wc implementations pad the count with blanks; the arithmetic
  # expansion normalises it to a bare number.
  echo "$(( ${size_bytes} ))"
}
80 | |||
# Print the first four bytes of a numbered chunk file as one unsigned
# decimal number in host byte order, preceded by a single blank and
# without a trailing newline. Prints nothing for an empty file.
#
# Arguments:
#   $1 - chunk number; zero-padded to ${filename_len} digits to form the
#        file name
# Globals:
#   filename_len - width of the chunk file names (read)
# Returns 1 if the file cannot be read.
get_dword() {
  dword_file=$(printf "%0${filename_len}d" "${1}")
  # od is POSIX, unlike hexdump.
  dword_value=$(od -A n -t u4 -N 4 "${dword_file}") || return 1
  # Deliberately unquoted: word splitting drops od's column padding.
  set -- ${dword_value}
  if [ $# -gt 0 ]; then
    printf ' %s' "${1}"
  fi
}
85 | |||
# Handler for the oldest layout: feed the list of dpr*.001 files found
# below the phonebook directory to extract_version_1, then give the
# resulting NN_unknown column files their final names.
#
# Arguments:
#   $1 - phonebook directory
handle_format_version_1() {
  echo "Working on ${1}. Detected pre-02/1996 Telefonbuch version."

  # Extract all dpr database files
  printf 'Extracting dpr databases ... '
  find "${1}" -name 'dpr*.001' | extract_version_1
  printf 'done.\n'

  # Rename table, one "<extracted column number>:<final name>" per entry,
  # processed in this order.
  for rename in \
    01:01_Flags \
    02:02_Nachname \
    03:03_Vorname \
    04:05_Adresszusatz \
    05:06_Ortszusatz \
    06:10_Zustellamt_PLZOst \
    07:07_Strasse \
    08:08_Hausnummer \
    09:04_Namenszusatz \
    10:09_Fax_Verweise \
    11:12_Vorwahl \
    12:13_Rufnummer \
    13:11_Ort \
    14:10_Postleitzahl
  do
    mv "${rename%%:*}_unknown" "${rename#*:}"
  done
}
109 | |||
110 | |||
# Print the first regular file matching "<dir>/<pattern>". Only <pattern>
# undergoes pathname expansion, so <dir> may contain blanks or glob
# characters. Returns 1 if nothing matches.
#   $1 - directory
#   $2 - glob pattern relative to the directory
_v2_locate() {
  for _v2_candidate in "${1}"/${2}; do
    if [ -f "${_v2_candidate}" ]; then
      printf '%s\n' "${_v2_candidate}"
      return 0
    fi
  done
  return 1
}

# Handler for the dat/teiln.dat layout: decompress teiln.dat into numbered
# chunk files, split them into column files, rename the columns and, when
# present, add street names (strassen.dat) and coordinates (karto.dat).
#
# Arguments:
#   $1 - phonebook directory
# Globals:
#   EL           - path of the 'el' binary (read)
#   filename_len - width of the chunk file names (exported for size() and
#                  get_dword())
# Exits 1 if the chunk files do not have the expected layout.
handle_format_version_2() {
  echo "Working on $1. Detected pre-2004 Telefonbuch version."

  # Portable replacement for the non-POSIX $'\t'
  tab=$(printf '\t')

  # Extract teiln.dat
  teiln=$(_v2_locate "${1}" '[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt]')
  if [ -z "${teiln}" ]; then
    echo "teiln.dat not found in $1"
    return 1
  fi
  do_decompress_version_2 "${teiln}" "teiln.dat"

  # See how long each filename is: take the first entry of the working
  # directory, which at this point holds the decompressed chunks.
  filename_len=0
  for chunk in *; do
    [ -e "${chunk}" ] && filename_len=${#chunk}
    break
  done
  export filename_len

  # Get total amount of files, for reporting progress. The arithmetic
  # expansion strips the padding wc may emit.
  number_of_files=$(( $(find -E . -depth 1 -regex '^\./[0123456789]+' | wc -l) ))
  if [ "${number_of_files}" -eq 0 ]; then
    echo "Unknown layout."
    exit 1
  fi

  # from 2000F on file 0+3*n is table, so make it default
  table_file=0; vname_file=2

  # if supposed vname file is larger than table file,
  # we're having a pre-2000F layout, so switch accordingly
  table_size=$(size "${table_file}")
  vname_size=$(size "${vname_file}")
  if [ "${table_size:-0}" -lt "${vname_size:-0}" ]; then
    table_file=2; nname_file=0; vname_file=1
  else
    nname_file=1
  fi

  # Table file has a table header with identical count
  # to nname file's header. Verify this. The counts stay unquoted in the
  # comparison because get_dword prefixes its output with a blank; an
  # unreadable header counts as a mismatch.
  nname_count=$(get_dword "${nname_file}")
  table_count=$(get_dword "${table_file}")
  if [ -z "${nname_count}" ] || [ -z "${table_count}" ] \
    || [ ${nname_count} -ne ${table_count} ]; then
    echo "Unknown layout."
    exit 1
  fi

  # Now loop over all files and dump them. Every third chunk, starting at
  # the respective first file, belongs to the same kind. split_version_2
  # replaces an earlier pure-shell loop built on hexdump/tail; the
  # meaning of its two arguments is not visible here.
  printf "Splitting decompressed nname chunks into their columns ... "
  jot -w "%0${filename_len}d" - "${nname_file}" "$(( number_of_files - 1 ))" 3 | split_version_2 1 1
  # First character of each nname record is the flag, the rest the name
  cut -c 1 < 01_unknown > 01_Flags
  cut -c 2- < 01_unknown > 02_Nachname
  rm 01_unknown
  printf "done.\n"

  printf "Splitting decompress vname chunks into their columns ... "
  jot -w "%0${filename_len}d" - "${vname_file}" "$(( number_of_files - 1 ))" 3 | xargs cat | tr '\n\0' '\t\n' | tr -d '\377' > 03_Vorname
  printf "done.\n"

  printf "Splitting decompress table file chunks into their columns ... "
  jot -w "%0${filename_len}d" - "${table_file}" "$(( number_of_files - 1 ))" 3 | split_version_2 4 0
  printf "done.\n"

  # wipe all temporary extracted files
  printf "Cleaning up decompressed chunks ... "
  find -E . -depth 1 -regex '^\./[0123456789]+' -delete
  printf "done.\n"

  # rename our columns extracted from the table file
  mv 04_unknown 04_Namenszusatz
  mv 05_unknown 05_Adresszusatz
  mv 06_unknown 06_Ortszusatz
  mv 08_unknown 08_Hausnummer
  mv 09_unknown 09_Verweise
  mv 10_unknown 10_Postleitzahl
  mv 11_unknown 11_Ort
  mv 12_unknown 12_Vorwahl
  mv 13_unknown 13_Rufnummer
  [ -f 14_unknown ] && mv 14_unknown 14_Email
  [ -f 15_unknown ] && mv 15_unknown 15_Webadresse

  # If street names come in an extra file, extract
  # street names first
  streets=$(_v2_locate "${1}" '[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]')
  if [ -n "${streets}" ]; then
    do_processfile_version_2 "${streets}" "street name" 99_Strassenname convert_zeros
  fi

  # extract street names if 07_unknown contains street indexes
  # instead of street names
  if [ -f 99_Strassenname ]; then
    mv 07_unknown 07_Strassenindex
    printf "Looking up street names from indexes ... "
    cut -d ';' -f 1 07_Strassenindex | "${EL}" -0x 99_Strassenname > 07_Strasse
    printf "done.\n"
  else
    mv 07_unknown 07_Strasse
  fi

  karto=$(_v2_locate "${1}" '[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]')
  if [ -n "${karto}" ]; then
    do_processfile_version_2 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw

    printf "Looking up geo coordinates for each phonebook entry ... "
    # Keep fields 1-4, 6 and 7 of every NUL-terminated, ';'-separated record
    tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr
    rm 90_Geokoordinaten_hnr_raw
    lam 10_Postleitzahl -s "${tab}" 11_Ort -s "${tab}" 07_Strasse -s "${tab}" 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  fi
}
216 | |||
# Handler for the phonebook.db layout: extract street names and the
# phonebook database into file_* chunks, regroup the chunks into the 11
# column files, resolve street indexes to names and, when a geo archive
# is present, look up coordinates for every entry.
#
# Arguments:
#   $1 - phonebook directory
# Globals:
#   EL - path of the 'el' binary (read)
handle_format_version_3() {
  echo "Working on $1. Detected post-2003 Telefonbuch version."

  # Portable replacement for the non-POSIX $'\t'
  tab=$(printf '\t')

  printf "Extracting street names ... "
  extract_version_3 "${1}/streets.tl"

  cat file_* | tr '\n\0' '\t\n' > 99_Strassenname
  rm file_*
  printf "done.\n"

  printf "Extracting phonebook.db ... "
  extract_version_3 "${1}/phonebook.db"

  # The arithmetic expansion strips the padding wc may emit.
  rows=$(( $(find . -name 'file_*' | wc -l) ))
  printf "done.\n"

  # Chunks are interleaved: chunk n belongs to column n modulo 11, so each
  # column is the concatenation of every 11th chunk. Column 0 is dumped as
  # hex, one byte per line; the others are converted to text lines.
  printf "Splitting decompressed chunks into their columns (11 total) ... 1, "
  jot -w "file_%05X" - 0 "$(( rows - 1 ))" 11 | xargs cat | xxd -ps -c1 > column_0

  for col in 1 2 3 4 5 6 7 8 9 10; do
    printf "%d, " $(( col + 1 ))
    jot -w "file_%05X" - "${col}" "$(( rows - 1 ))" 11 | xargs cat | tr '\n\0' '\t\n' > "column_${col}"
  done
  printf "done.\n"

  printf "Cleaning up decompressed chunks ... "
  find . -name 'file_*' -delete
  printf "done.\n"

  mv column_0 01_Flags
  mv column_1 02_Nachname
  mv column_2 03_Vorname
  mv column_3 04_05_Namenszusatz_Addresszusatz
  mv column_4 09_Verweise
  mv column_5 07_08_Strassenindex_Hausnummer
  mv column_6 12_Vorwahl
  mv column_7 10_Postleitzahl
  mv column_8 11_Ort
  mv column_9 13_Rufnummer
  mv column_10 14_15_Email_Webadresse

  printf "Looking up street names from indexes ... "
  cut -f 1 07_08_Strassenindex_Hausnummer | "${EL}" -0 99_Strassenname > 07_Strasse
  printf "done.\n"

  printf "Splitting house numbers ... "
  # Append a tab to every line first: cut passes lines without any
  # delimiter through unchanged, which would turn a bare street index
  # into a house number.
  sed -E "s:\$:${tab}:" < 07_08_Strassenindex_Hausnummer | cut -f 2 > 08_Hausnummer
  printf "done.\n"

  if [ -f "${1}/zip-streets-hn-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: house number) ... "
    extract_version_3 "${1}/zip-streets-hn-geo.tl"
    cat file_* > 90_Geokoordinaten_hnr
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    lam 10_Postleitzahl -s "${tab}" 07_Strasse -s "${tab}" 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  elif [ -f "${1}/zip-streets-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: street) ... "
    extract_version_3 "${1}/zip-streets-geo.tl"
    cat file_* > 91_Geokoordinaten_str
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    lam 10_Postleitzahl -s "${tab}" 07_Strasse | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten
    printf "done.\n"
  fi
  # -f: without a geo archive no chunks are left at this point, and a
  # plain rm would fail on the unmatched pattern.
  rm -f file_*
}
284 | |||
# Run main() last, so that every function it calls is already defined.
286 | main "$@" | ||