summaryrefslogtreecommitdiff
path: root/src/postprocess
diff options
context:
space:
mode:
author: Dirk Engling <erdgeist@erdgeist.org>, 2019-03-20 04:30:29 +0100
committer: Dirk Engling <erdgeist@erdgeist.org>, 2019-03-20 04:30:29 +0100
commit: b4bf8417af0d8ebff2c50570c70fdecaf6a53ed9 (patch)
tree: 4b9341a67c6fc9fd48cae5eecee79ff10ee0fe2f /src/postprocess
parent: c4a8bd34b41b2be26426ea01aafc69d41260cce5 (diff)
Add code to lookup new zip codes for 1995 entries and fix up streetnames
Diffstat (limited to 'src/postprocess')
-rw-r--r--src/postprocess/map_plz.c388
-rw-r--r--src/postprocess/map_plz.h39
-rw-r--r--src/postprocess/postprocess-1992.sh32
-rwxr-xr-xsrc/postprocess/simi.py11
4 files changed, 470 insertions, 0 deletions
diff --git a/src/postprocess/map_plz.c b/src/postprocess/map_plz.c
new file mode 100644
index 0000000..ab0db71
--- /dev/null
+++ b/src/postprocess/map_plz.c
@@ -0,0 +1,388 @@
1#include "mystdlib.h"
2
3#include <stdlib.h>
4#include <stdio.h>
5#include <string.h>
6#include <inttypes.h>
7#include <assert.h>
8
9#include "map_plz.h"
10
11static entry_t *g_book, *g_book_by_name;
12static size_t g_book_size, g_zipmap_size;
13static const char * g_zipmap[32000];
14static FILE *g_mapfile_out;
15
16int main(int argc, char **args) {
17 MAP tbuch, zipmap, brutemap;
18 FILE *bfile, *streetfile_out;
19 char *ptr, *input = malloc(65335);
20 char *ort = malloc(65335), vorwahl_block[16];
21 int i, brutes_count = 0, report = 0;
22 brute_t *brutes = malloc(200000*sizeof(brute_t));
23
24 /* prepare io */
25 if (argc != 4) exit(1);
26 tbuch = map_file(args[1], 1);
27 zipmap = map_file(args[2], 1);
28 brutemap = map_file(args[3], 1);
29
30 /* read all files */
31 g_zipmap[0] = (char*)zipmap->addr;
32 for (i=0; i<zipmap->size; ++i)
33 if (!zipmap->addr[i])
34 g_zipmap[++g_zipmap_size] = (char*)(zipmap->addr + i + 1);
35 qsort(g_zipmap, g_zipmap_size, sizeof(char*), cc);
36
37 ptr = (char*)brutemap->addr;
38
39 /* Split brute records */
40 while (ptr < (char*)brutemap->addr + brutemap->size) {
41 brutes[brutes_count].count = atol(ptr); ptr += strlen(ptr) + 1;
42 brutes[brutes_count].similarity = atol(ptr); ptr += strlen(ptr) + 1;
43
44 brutes[brutes_count].vorwahl = ptr; ptr += strlen(ptr) + 1;
45 brutes[brutes_count].ort = ptr; ptr += strlen(ptr) + 1;
46 brutes[brutes_count].strasse_1992 = ptr; ptr += strlen(ptr) + 1;
47 brutes[brutes_count].strasse_1995 = ptr; ptr += strlen(ptr) + 1;
48
49 if (brutes[brutes_count].count >= 20 || brutes[brutes_count].similarity >= 70)
50 brutes_count++;
51 }
52 qsort(brutes, brutes_count, sizeof(brute_t), sort_brutes);
53
54 /* count phonebook lines */
55 for (i=0; i<tbuch->size; ++i)
56 if (!tbuch->addr[i])
57 ++g_book_size;
58
59 /* We expect 8 columns per line */
60 g_book_size /= 8;
61
62 g_book = (entry_t*)malloc(g_book_size * sizeof(entry_t));
63 g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t));
64
65 /* Split pointers into input files into our arrays */
66 for (i = 0, ptr = (char*)tbuch->addr; i < g_book_size; ++i) {
67 g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1;
68 g_book[i].ort = ptr; ptr += strlen(ptr) + 1;
69 g_book[i].strasse = ptr; ptr += strlen(ptr) + 1;
70 g_book[i].hnr = ptr; ptr += strlen(ptr) + 1;
71 g_book[i].name = ptr; ptr += strlen(ptr) + 1;
72 g_book[i].vorname = ptr; ptr += strlen(ptr) + 1;
73 g_book[i].nummer = ptr; ptr += strlen(ptr) + 1;
74 g_book[i].plz = ptr; ptr += strlen(ptr) + 1;
75 }
76 /* Copy input array so that it can be sorted by different criteria */
77 memcpy(g_book_by_name, g_book, g_book_size * sizeof(entry_t));
78
79 fprintf(stderr, "STEP 1: import done\n");
80
81 /* Sort the whole thing */
82 qsort(g_book, g_book_size, sizeof(entry_t), sort_by_voshnvn);
83 qsort(g_book_by_name, g_book_size, sizeof(entry_t), sort_by_vonvh);
84
85 fprintf(stderr, "STEP 2: sort done\n");
86
87 g_mapfile_out = fopen("zip_mapfile.txt", "w");
88 streetfile_out = fopen("07_Strasse_fixed", "w");
89 bfile = fopen("brutemap.txt", "w");
90
91 while (1) {
92 char flag, *t, *l = fgets(input, 65536, stdin);
93 const char *orig_strasse;
94 entry_t local;
95 brute_t *found_brute;
96 int once = 0, fixed = 0;
97
98 if (!l) break;
99 if ((report++ % 300000) == 0) fprintf(stderr, "% 10d lines done\n", report);
100
101 /* Scan and skip flags */
102 flag = strtoul(l, &l, 16); ++l;
103
104 /* Copy vorwahl, if in field */
105 local.vorwahl = advance_and_replace(&l, 9, 0);
106
107 /* Copy over vorwahl for whole exported block, if we're not in cont */
108 t = advance_and_replace(&l, 9, 0);
109 if (flag < 2) strcpy(vorwahl_block, t);
110
111 /* Only copy over ort from continuations, if present */
112 if (flag < 2 || (*l != 9) ) local.ort = l; else local.ort = ort;
113 advance_and_replace(&l, 9, 0);
114
115 /* Take copy of ort for continuations, if on start of multi line record */
116 if (flag == 1) strcpy(ort, local.ort);
117
118 /* Copy rest of the fields verbatim */
119 local.strasse = advance_and_replace(&l, 9, 0);
120 local.hnr = advance_and_replace(&l, 9, 0);
121 local.name = advance_and_replace(&l, 9, 0);
122 local.vorname = advance_and_replace(&l, 9, 0);
123 local.nummer = advance_and_replace(&l, 9, 0);
124 local.plz = advance_and_replace(&l, 9, ':');
125 advance_and_replace(&l, 10, 0);
126
127 orig_strasse = local.strasse;
128rescan:
129
130 if (search_and_verify(&local, sort_by_voshnvn, 1) ||
131 search_and_verify(&local, sort_by_voshnv, 1) ||
132 search_and_verify(&local, sort_by_vosh, 0) ||
133 search_and_verify(&local, sort_by_vos, 0)) {
134 fputs(fixed ? local.strasse : orig_strasse, streetfile_out);
135 fputc(10, streetfile_out);
136 continue;
137 }
138
139 /* If we can't find the street, it might be due to an incorrect vorwahl, try to fix it up */
140 if (strcmp(local.vorwahl, vorwahl_block)) {
141 local.vorwahl = vorwahl_block;
142 goto rescan;
143 }
144
145 /* If we do have vorwahl + ort + strasse + hnr, but no match, street might have changed name */
146 if (*local.vorwahl && *local.ort && *local.strasse && *local.hnr && *local.name) {
147 entry_t *found = bsearch_first(&local, g_book_by_name, g_book_size, sizeof(entry_t), sort_by_vonvh);
148 if (found)
149 fprintf(bfile, "%s\t%s\t%s\t%s\t%s\n", local.vorwahl, local.ort, local.strasse, found->strasse, local.hnr);
150 }
151
152 /* See if we can find and correct the street name */
153 if (!once++) {
154 found_brute = bsearch(&local, brutes, brutes_count, sizeof(brute_t), search_brute);
155 if (found_brute && ( (found_brute->similarity >= 70) || (found_brute->count >= 20)) ) {
156 fixed = found_brute->similarity >= 80;
157 local.strasse = found_brute->strasse_1995;
158 goto rescan;
159 }
160 }
161
162 /* If nothing works, see if the whole village has only one zip, else just print an empty line */
163 if (!search_and_verify(&local, sort_by_vo, 2))
164 putchar(10);
165 fputs(orig_strasse, streetfile_out);
166 fputc(10, streetfile_out);
167 }
168}
169
/*
 * Cut the next field off a delimited line.
 *
 * p       in/out cursor into a writable, NUL terminated string
 * find    delimiter that ends the field
 * replace byte that is written over the delimiter
 *
 * Returns the start of the field (the old cursor) and moves *p just behind
 * the delimiter. If the delimiter does not occur any more, the rest of the
 * string is returned unchanged and *p is left on the terminating NUL, so
 * that further calls yield empty fields instead of dereferencing NULL.
 */
static char * advance_and_replace(char **p, char find, char replace) {
  char *field = *p;
  char *hit = strchr(field, find);

  if (!hit) {
    *p = field + strlen(field);
    return field;
  }

  *hit = replace;
  *p = hit + 1;
  return field;
}
177
/*
 * Binary search that returns the FIRST of possibly many equal elements.
 *
 * key    object handed to compar as its first argument
 * base   start of an array that is sorted consistently with compar
 * nel    number of elements in the array
 * width  size of one element in bytes
 * compar three way comparison, called as compar(key, element)
 *
 * Returns a pointer to the lowest element comparing equal to key, or NULL
 * if there is none.
 */
static void *bsearch_first( const void * const key, const void * base, const size_t nel, const size_t width, int (*compar) (const void *, const void *)) {
  const uint8_t * const first = (const uint8_t *)base;
  const uint8_t *lo = first;
  size_t interval = nel;

  while (interval) {
    const uint8_t *lookat = lo + width * (interval / 2);
    int cmp = compar(key, lookat);

    /* A hit only counts if it has no equal predecessor. The element at the
       very start of the array has no predecessor at all, so test lookat
       itself (not the window start) before peeking at lookat - width. */
    if (cmp == 0 && (lookat == first || compar(key, lookat - width)))
      return (void *)lookat;

    if (cmp > 0) {
      /* Continue right of lookat; lookat itself drops out of the window */
      lo = lookat + width;
      interval--;
    }
    /* Otherwise (smaller, or equal with an equal predecessor) continue in
       the part left of lookat. */
    interval /= 2;
  }

  return NULL;
}
197
198/* For if we have vorwahl and ort and strasse */
199static int sort_by_voshnvn(const void *a, const void *b) {
200 int res;
201 entry_t *ea = (entry_t *)a;
202 entry_t *eb = (entry_t *)b;
203
204 if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
205 if ((res = strcmp(ea->ort, eb->ort ))) return res;
206 if ((res = strcmp(ea->strasse, eb->strasse))) return res;
207 if ((res = strcmp(ea->hnr, eb->hnr ))) return res;
208 if ((res = strcmp(ea->name, eb->name ))) return res;
209 if ((res = strcmp(ea->vorname, eb->vorname))) return res;
210 if ((res = strcmp(ea->nummer, eb->nummer ))) return res;
211 return 0;
212}
213
214/* more relaxed, if rufnummer missmatches */
215static int sort_by_voshnv(const void *a, const void *b) {
216 int res;
217 entry_t *ea = (entry_t *)a;
218 entry_t *eb = (entry_t *)b;
219
220 if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
221 if ((res = strcmp(ea->ort, eb->ort ))) return res;
222 if ((res = strcmp(ea->strasse, eb->strasse))) return res;
223 if ((res = strcmp(ea->hnr, eb->hnr ))) return res;
224 if ((res = strcmp(ea->name, eb->name ))) return res;
225 if ((res = strcmp(ea->vorname, eb->vorname))) return res;
226 return 0;
227}
228
229/* more relaxed, if rufnummer missmatches */
230static int sort_by_vosh(const void *a, const void *b) {
231 int res;
232 entry_t *ea = (entry_t *)a;
233 entry_t *eb = (entry_t *)b;
234
235 if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
236 if ((res = strcmp(ea->ort, eb->ort ))) return res;
237 if ((res = strcmp(ea->strasse, eb->strasse))) return res;
238 if ((res = strcmp(ea->hnr, eb->hnr ))) return res;
239 return 0;
240}
241
242/* more relaxed, if rufnummer missmatches */
243static int sort_by_vos(const void *a, const void *b) {
244 int res;
245 entry_t *ea = (entry_t *)a;
246 entry_t *eb = (entry_t *)b;
247
248 if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
249 if ((res = strcmp(ea->ort, eb->ort ))) return res;
250 if ((res = strcmp(ea->strasse, eb->strasse))) return res;
251 return 0;
252}
253
254/* last resort, check if the whole vorwahl+ort set matches a single zip */
255static int sort_by_vo(const void *a, const void *b) {
256 int res;
257 entry_t *ea = (entry_t *)a;
258 entry_t *eb = (entry_t *)b;
259
260 if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
261 if ((res = strcmp(ea->ort, eb->ort ))) return res;
262 return 0;
263}
264
265/* For brute forcing name if we can't find strasse */
266static int sort_by_vonvh(const void *a, const void *b) {
267 int res;
268 entry_t *ea = (entry_t *)a;
269 entry_t *eb = (entry_t *)b;
270
271 if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
272 if ((res = strcmp(ea->ort, eb->ort ))) return res;
273 if ((res = strcmp(ea->name, eb->name ))) return res;
274 if ((res = strcmp(ea->vorname, eb->vorname))) return res;
275 if ((res = strcmp(ea->hnr, eb->hnr ))) return res;
276 return 0;
277}
278
279static int sort_brutes(const void *a, const void *b) {
280 int res;
281 brute_t *ea = (brute_t *)a;
282 brute_t *eb = (brute_t *)b;
283
284 if ((res = strcmp(ea->vorwahl, eb->vorwahl ))) return res;
285 if ((res = strcmp(ea->ort, eb->ort ))) return res;
286 if ((res = strcmp(ea->strasse_1992, eb->strasse_1992))) return res;
287 return 0;
288}
289
290static int search_brute(const void *a, const void *b) {
291 int res;
292 entry_t *ea = (entry_t*)a;
293 brute_t *eb = (brute_t *)b;
294
295 if ((res = strcmp(ea->vorwahl, eb->vorwahl ))) return res;
296 if ((res = strcmp(ea->ort, eb->ort ))) return res;
297 if ((res = strcmp(ea->strasse, eb->strasse_1992))) return res;
298 return 0;
299}
300
301/* If zip codes end in dots, always chose the one with more info */
302static int test_dot(entry_t * iter, entry_t * found) {
303 char * dot_iter = strchr(iter->plz, '.');
304 char * dot_found = strchr(found->plz, '.');
305 int di = 64, df = 64; // magic value large enough so that the MIN() always choses the other one
306
307 if (!dot_iter && !dot_found) return 0;
308
309 if (dot_iter) di = dot_iter - iter->plz;
310 if (dot_found) df = dot_found - found->plz;
311
312 if (memcmp(iter->plz, found->plz, ((di < df) ? di : df))) return 0;
313
314 if (dot_iter && !dot_found) iter->plz = found->plz;
315 if (!dot_iter && dot_found) found->plz = iter->plz;
316 if (dot_iter && dot_found && di > df) found->plz = iter->plz;
317 if (dot_iter && dot_found && di < df) iter->plz = found->plz;
318
319 return 1;
320}
321
322static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) {
323 entry_t *test_iter = found + 1;
324 entry_t *end = g_book + g_book_size;
325
326 while (test_iter < end && !compar((void*)candidate, (void*)test_iter)) {
327 if (strcmp(test_iter->plz, found->plz)) {
328 if (test_dot(test_iter, found))
329 continue;
330 return 0;
331 }
332 ++test_iter;
333 }
334
335 return found;
336}
337
338static entry_t * verify_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) {
339 entry_t *test_iter = found + 1;
340 entry_t *end = g_book + g_book_size;
341 char pair[32];
342
343 if (!g_zipmap_size)
344 return verify_unique_zip(found, candidate, compar);
345
346 /* Do we know about the oldzip-newzip mapping? */
347 strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, found->plz);
348 if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2))
349 return found;
350
351 while ((test_iter < end) && !compar((void*)candidate, (void*)test_iter)) {
352 if (strcmp(test_iter->plz, found->plz)) {
353 if (test_dot(test_iter, found))
354 continue;
355 strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, test_iter->plz);
356 if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2)) {
357 // printf ("FIXED %s -> %s: ", found->plz, test_iter->plz);
358 return test_iter;
359 }
360 }
361 ++test_iter;
362 }
363
364 return found;
365}
366
367static int search_and_verify(entry_t *candidate, int (*compar) (const void *, const void *), int flag) {
368 entry_t *found = bsearch_first(candidate, g_book, g_book_size, sizeof(entry_t), compar);
369
370 if (!found) return 0;
371
372 if (flag == 2)
373 found = verify_unique_zip(found, candidate, compar);
374 else
375 found = verify_zip(found, candidate, compar);
376
377 if (!found)
378 return 0;
379
380 if (flag == 1)
381 fprintf(g_mapfile_out, "%s\t%s\n", candidate->plz, found->plz);
382
383 /* Output plz*/
384 puts(found->plz);
385
386 return 1;
387}
388
diff --git a/src/postprocess/map_plz.h b/src/postprocess/map_plz.h
new file mode 100644
index 0000000..8839b5c
--- /dev/null
+++ b/src/postprocess/map_plz.h
@@ -0,0 +1,39 @@
1#pragma once
2
/* One phone book record. All members point into buffers owned elsewhere;
   the field order matches the column order of the binary phone book. */
typedef struct {
  const char *vorwahl;  /* area code */
  const char *ort;      /* town */
  const char *strasse;  /* street */
  const char *hnr;      /* house number */
  const char *name;     /* last name */
  const char *vorname;  /* first name */
  const char *nummer;   /* phone number */
  const char *plz;      /* zip code */
} entry_t;
13
/* One candidate street rename: within vorwahl/ort, the street known as
   strasse_1992 may be listed as strasse_1995 in the newer book. */
typedef struct {
  const char *vorwahl;       /* area code */
  const char *ort;           /* town */
  const char *strasse_1992;  /* street name to be replaced */
  const char *strasse_1995;  /* proposed replacement */
  int similarity;            /* similarity score of the two names */
  int count;                 /* how often this pair was observed */
} brute_t;
22
23static void *bsearch_first( const void * const key, const void * base, const size_t nel, const size_t width, int (*compar) (const void *, const void *));
24static int sort_by_voshnvn(const void *a, const void *b);
25static int sort_by_voshnv(const void *a, const void *b);
26static int sort_by_vosh(const void *a, const void *b);
27static int sort_by_vos(const void *a, const void *b);
28static int sort_by_vonvh(const void *a, const void *b);
29static int sort_by_vo(const void *a, const void *b);
30static int sort_brutes(const void *a, const void *b);
31static int search_brute(const void *a, const void *b);
32static int test_dot(entry_t * iter, entry_t * found);
33
/* qsort() comparator for an array of C strings: both arguments point at
   array slots, i.e. at char pointers. */
static int cc(const void *a, const void *b) {
  const char *lhs = *(const char * const *)a;
  const char *rhs = *(const char * const *)b;

  return strcmp(lhs, rhs);
}
/* bsearch() comparator for an array of C strings: a is the key string
   itself, b points at an array slot (a char pointer). */
static int cc2(const void *a, const void *b) {
  const char *key = (const char *)a;
  const char *slot = *(const char * const *)b;

  return strcmp(key, slot);
}
36static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *));
37static entry_t * verify_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *));
38static int search_and_verify(entry_t *candidate, int (*compar) (const void *, const void *), int flag);
39static char * advance_and_replace(char **p, char find, char replace);
diff --git a/src/postprocess/postprocess-1992.sh b/src/postprocess/postprocess-1992.sh
new file mode 100644
index 0000000..1e685d2
--- /dev/null
+++ b/src/postprocess/postprocess-1992.sh
@@ -0,0 +1,32 @@
#!/bin/bash
# Maps the 1992 phone book entries to zip codes looked up in the 1995 data.
# Needs bash: the script relies on brace expansion and $'...' quoting.

# Generate file with all relevant columns from 1992
paste 1992_Q2/{01_Flags,12_Vorwahl,12_Vorwahl_block,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl_West,10_Zustellamt_PLZOst} > 1992-fvvoshnvrpp.txt

# Generate lookup file from 1995 (newlines and tabs both become NUL; the
# second set is spelled out because padding a short set is not portable)
paste 1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} | tr '\n\t' '\0\0' > 1995-voshnvrp.bin

# To debug in lldb (an lldb command, not a shell command, hence commented out):
# process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin

# Compile plz mapper
cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c

# First pass with empty zip map and brute map:
# outputs mapped plz, generates brutemap.txt and zip_mapfile.txt
touch brutemap_input.bin zip_simple_map.bin
./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl

# generate street name translation table from brutemap,
# only taking into account similar street names
# cut -f 3,4 brutemap.txt | tr '[:upper:]' '[:lower:]' | paste brutemap.txt - | cut -f 1-4,6,7 | ./jaro | cut -f 1-5 > brutemap_filtered.txt

# generate street name translation table from brutemap,
# only taking into account similar street names, new style
cut -f 3,4 brutemap.txt | python simi.py | paste - brutemap.txt > brutemap_simifiltered.txt

# Sort and prepare similarity filtered files for the merge
# (uniq -c prepends the count, sed turns the blank behind it into a tab)
cut -f 1-5 brutemap_simifiltered.txt | sort | uniq -c | sed -E $'s:^ *([[:digit:]]+) :\\1\t:' | tr '\n\t' '\0\0' > brutemap_input.bin

# compile zipmap into a binary format
sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin

# Second pass: redo the mapping with the data from brutemap and zipmap
./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl
diff --git a/src/postprocess/simi.py b/src/postprocess/simi.py
new file mode 100755
index 0000000..62ff1ff
--- /dev/null
+++ b/src/postprocess/simi.py
@@ -0,0 +1,11 @@
#!/usr/bin/env python3
"""Score the similarity of tab separated string pairs read from stdin.

For every input line "<a>\t<b>" one line with an integer between 0 and 100
is printed: the mean of the Ratcliff/Obershelp, Jaro-Winkler and cosine
similarities of the case folded strings, scaled to percent and truncated.

Exactly one output line is produced per input line, so the output can be
pasted next to the input. A line without exactly one tab raises ValueError.
"""

import textdistance
from sys import stdin

# Iterate lazily instead of reading the whole input into memory first.
for line in stdin:
    # Drop the line terminator; otherwise it would be scored as part of the
    # second string and even identical names would stay below 100.
    # NOTE(review): this raises scores slightly compared to scoring with the
    # newline included; re-check any thresholds tuned against the old output.
    x, y = line.rstrip('\n').split('\t')
    x = x.casefold()
    y = y.casefold()
    v = (textdistance.ratcliff_obershelp.normalized_similarity(x, y)
         + textdistance.jaro_winkler.normalized_similarity(x, y)
         + textdistance.cosine.normalized_similarity(x, y))
    print(int(100 * (v / 3)))
11 print (int(100*(v/3)))