summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Dirk Engling <erdgeist@erdgeist.org> 2014-02-11 17:12:51 +0100
committer: Dirk Engling <erdgeist@erdgeist.org> 2014-02-11 17:12:51 +0100
commit: 9c46deb628e21991606bbf2a23ecb678a40cd243 (patch)
tree: 5055efc5715f2f66e2e4c658cd8bb536c724e57c
parent: 31741d636811d5a0ac5a83f3ccce6875d2a92d78 (diff)
Reworked code to split old telefonbuch distributions, the old version was too slow
-rwxr-xr-x src/makecolumns.sh 104
1 files changed, 49 insertions, 55 deletions
diff --git a/src/makecolumns.sh b/src/makecolumns.sh
index 2df65c9..5d2d90b 100755
--- a/src/makecolumns.sh
+++ b/src/makecolumns.sh
@@ -77,10 +77,8 @@ size() {
77} 77}
78 78
79get_dword() { 79get_dword() {
80 # $1 file, $2 offset 80 # $1 file
81 file=`printf %0${filename_len}d ${1}` 81 hexdump -n 4 -v -e '" " 1/4 "%u"' `printf %0${filename_len}d ${1}`
82 set -- `od -tu4 -N4 -j$(( 4*${2:-0} )) ${file}`
83 printf "%d\n" $2
84} 82}
85 83
86handle_old_format() { 84handle_old_format() {
@@ -113,38 +111,34 @@ handle_old_format() {
113 fi 111 fi
114 112
115 # Now loop over all files and dump them 113 # Now loop over all files and dump them
116 printf "Splitting decompressed chunks into their columns ... " 114 printf "Splitting decompressed nname chunks into their columns ... "
117 reported=0 115 for file in `jot -w %0${filename_len}d - ${nname_file} $(( number_of_files - 1 )) 3`; do
118 while [ -f `printf %0${filename_len}d ${nname_file}` ]; do 116 set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}`
119 # Get number of entries in this round 117 tail -c +$(( $2 + 1 )) ${file}
120 count=`get_dword ${nname_file}` 118 done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname
121 119 cut -c 1 < 01_02_Flags_Nachname > 01_Flags
122 # Get offset into first nname 120 cut -c 2- < 01_02_Flags_Nachname > 02_Nachname
123 nname_off=$(( `get_dword ${nname_file} 1` + 1 )) 121 rm 01_02_Flags_Nachname
124 122 printf "done.\n"
125 # Now get the flags before the nnames 123
126 tail -c +${nname_off} `printf %0${filename_len}d ${nname_file}` | tr '\n\0' '\t\n' | head -n ${count} | cut -c -1 >> 01_Flags 124 printf "Splitting decompress vname chunks into their columns ... "
127 tail -c +${nname_off} `printf %0${filename_len}d ${nname_file}` | tr '\n\0' '\t\n' | head -n ${count} | cut -c 2- >> 02_Nachname 125 jot -w "%0${filename_len}d" - ${vname_file} $(( number_of_files - 1 )) 3 | xargs cat | tr '\n\0' '\t\n' > 03_Vorname
128 126 printf "done.\n"
129 # Extract the vnames 127
130 tr '\n\0' '\t\n' < `printf %0${filename_len}d ${vname_file}` | head -n ${count} >> 03_Vorname 128 printf "Splitting decompress table file chunks into their columns ... "
131 129 jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3 | splitold
132 # Offset into first table entry tells us how many 130# for file in `jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3`; do
133 # fields are in table file 131# # Offset into first table entry tells us how many
134 table_entries=$(( `get_dword ${table_file} 1` / 4 - 1 )) 132# # fields are in table file
135 133# set -- `hexdump -n 64 -v -e '" " 1/4 "%u"' ${file}`
136 # Now iterate over all entries in the table file 134# count=$1; table_entries=$(( $2 / 4 - 1 )); shift
137 for table_index in `jot ${table_entries}`; do 135#
138 table_off=`get_dword ${table_file} ${table_index}` 136# # Now iterate over all entries in the table file
139 tail -c +$(( table_off + 1 )) `printf %0${filename_len}d ${table_file}` | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( table_index + 3 ))` 137# for idx in `jot ${table_entries}`; do
140 done 138# tail -c +$(( $1 + 1 )) ${file} | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( idx + 3 ))`
141 139# shift
142 # Advance the filenames. 140# done
143 nname_file=$(( nname_file+3 )) 141# done
144 vname_file=$(( vname_file+3 ))
145 table_file=$(( table_file+3 ))
146 [ 1 -eq $(( ( ( table_file * 20 ) / number_of_files ) > reported )) ] && printf "%d%% " $(( (reported+=1) * 5 ))
147 done
148 printf "done.\n" 142 printf "done.\n"
149 143
150 # wipe all temporary extracted files 144 # wipe all temporary extracted files
@@ -153,17 +147,19 @@ handle_old_format() {
153 printf "done.\n" 147 printf "done.\n"
154 148
155 # rename our columns extracted from the table file 149 # rename our columns extracted from the table file
156 mv 04_unknown 04_Namenszusatz 150 printf "Converting string terminators to line newlines ... "
157 mv 05_unknown 05_Adresszusatz 151 tr '\0' '\n' < 04_unknown > 04_Namenszusatz
158 mv 06_unknown 06_Ortszusatz 152 tr '\0' '\n' < 05_unknown > 05_Adresszusatz
159 mv 08_unknown 08_Hausnummer 153 tr '\0' '\n' < 06_unknown > 06_Ortszusatz
160 mv 09_unknown 09_Verweise 154 tr '\0' '\n' < 08_unknown > 08_Hausnummer
161 mv 10_unknown 10_Postleitzahl 155 tr '\0' '\n' < 09_unknown > 09_Verweise
162 mv 11_unknown 11_Ort 156 tr '\0' '\n' < 10_unknown > 10_Postleitzahl
163 mv 12_unknown 12_Vorwahl 157 tr '\0' '\n' < 11_unknown > 11_Ort
164 mv 13_unknown 13_Rufnummer 158 tr '\0' '\n' < 12_unknown > 12_Vorwahl
165 [ -f 14_unknown ] && mv 14_unknown 14_Email 159 tr '\0' '\n' < 13_unknown > 13_Rufnummer
166 [ -f 15_unknown ] && mv 15_unknown 15_Webadresse 160 [ -f 14_unknown ] && tr '\0' '\n' < 14_unknown > 14_Email
161 [ -f 15_unknown ] && tr '\0' '\n' < 15_unknown > 15_Webadresse
162 printf "done.\n"
167 163
168 # If street names come in an extra file, extract 164 # If street names come in an extra file, extract
169 # street names first 165 # street names first
@@ -173,13 +169,14 @@ handle_old_format() {
173 # extract street names if 07_unknown contains street indexes 169 # extract street names if 07_unknown contains street indexes
174 # instead of street names 170 # instead of street names
175 if [ -f 99_Strassenname ]; then 171 if [ -f 99_Strassenname ]; then
176 mv 07_unknown 07_Strassenindex 172 tr '\0' '\n' < 07_unknown > 07_Strassenindex
177 printf "Looking up street names from indexes ... " 173 printf "Looking up street names from indexes ... "
178 cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse 174 cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse
179 printf "done.\n" 175 printf "done.\n"
180 else 176 else
181 mv 07_unknown 07_Strasse 177 tr '\0' '\n' < 07_unknown > 07_Strasse
182 fi 178 fi
179 rm ??_unknown
183 180
184 karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt] 181 karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]
185 [ -f ${karto} ] && do_processfile_old ${karto} "geo coordinates" 90_Geokoordinaten_hnr_raw 182 [ -f ${karto} ] && do_processfile_old ${karto} "geo coordinates" 90_Geokoordinaten_hnr_raw
@@ -187,7 +184,7 @@ handle_old_format() {
187 printf "Looking up geo coordinates for each phonebook entry ... " 184 printf "Looking up geo coordinates for each phonebook entry ... "
188 tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr 185 tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr
189 rm 90_Geokoordinaten_hnr_raw 186 rm 90_Geokoordinaten_hnr_raw
190 lam 10_Postleitzahl -s $'\t' 07_Strasse -s $'\t' 08_Hausnummer | mapcoords 90_Geokoordinaten_hnr | convertcoords > 16_Koordinaten 187 lam 10_Postleitzahl -s $'\t' 11_Ort -s $'\t' 07_Strasse -s $'\t' 08_Hausnummer | mapcoords 90_Geokoordinaten_hnr | convertcoords > 16_Koordinaten
191 printf "done.\n" 188 printf "done.\n"
192} 189}
193 190
@@ -204,17 +201,14 @@ handle_new_format() {
204 decompress $1/phonebook.db 201 decompress $1/phonebook.db
205 202
206 rows=`find . -name file_\* | wc -l` 203 rows=`find . -name file_\* | wc -l`
207 rows=$(( rows / 11 ))
208 printf "done.\n" 204 printf "done.\n"
209 205
210 # Do enumerations with builtin shell tools. Unfortunally neither
211 # jot nor seq are standards
212 printf "Splitting decompressed chunks into their columns (11 total) ... 1, " 206 printf "Splitting decompressed chunks into their columns (11 total) ... 1, "
213 f=-1; while [ $f -lt $rows ]; do printf "file_%05X " $(( (f+=1) * 11)); done | xargs cat | xxd -ps -c1 > column_0 207 jot -w "file_%05X" - 0 $rows 11 | xargs cat | xxd -ps -c1 > column_0
214 208
215 for col in 1 2 3 4 5 6 7 8 9 10; do 209 for col in 1 2 3 4 5 6 7 8 9 10; do
216 printf "%d, " $(( col + 1 )) 210 printf "%d, " $(( col + 1 ))
217 f=-1; while [ $f -lt $rows ]; do printf "file_%05X " $(( col + (f+=1) * 11 )); done | xargs cat | tr '\n\0' '\t\n' > column_${col} 211 jot -w "file_%05X" - ${col} ${rows} 11 | xargs cat | tr '\n\0' '\t\n' > column_${col}
218 done 212 done
219 printf "done.\n" 213 printf "done.\n"
220 214