diff options
| author | Dirk Engling <erdgeist@erdgeist.org> | 2019-03-20 04:30:29 +0100 |
|---|---|---|
| committer | Dirk Engling <erdgeist@erdgeist.org> | 2019-03-20 04:30:29 +0100 |
| commit | b4bf8417af0d8ebff2c50570c70fdecaf6a53ed9 (patch) | |
| tree | 4b9341a67c6fc9fd48cae5eecee79ff10ee0fe2f | |
| parent | c4a8bd34b41b2be26426ea01aafc69d41260cce5 (diff) | |
Add code to lookup new zip codes for 1995 entries and fix up streetnames
| -rw-r--r-- | src/postprocess/map_plz.c | 388 | ||||
| -rw-r--r-- | src/postprocess/map_plz.h | 39 | ||||
| -rw-r--r-- | src/postprocess/postprocess-1992.sh | 32 | ||||
| -rwxr-xr-x | src/postprocess/simi.py | 11 |
4 files changed, 470 insertions, 0 deletions
diff --git a/src/postprocess/map_plz.c b/src/postprocess/map_plz.c new file mode 100644 index 0000000..ab0db71 --- /dev/null +++ b/src/postprocess/map_plz.c | |||
| @@ -0,0 +1,388 @@ | |||
| 1 | #include "mystdlib.h" | ||
| 2 | |||
| 3 | #include <stdlib.h> | ||
| 4 | #include <stdio.h> | ||
| 5 | #include <string.h> | ||
| 6 | #include <inttypes.h> | ||
| 7 | #include <assert.h> | ||
| 8 | |||
| 9 | #include "map_plz.h" | ||
| 10 | |||
| 11 | static entry_t *g_book, *g_book_by_name; | ||
| 12 | static size_t g_book_size, g_zipmap_size; | ||
| 13 | static const char * g_zipmap[32000]; | ||
| 14 | static FILE *g_mapfile_out; | ||
| 15 | |||
| 16 | int main(int argc, char **args) { | ||
| 17 | MAP tbuch, zipmap, brutemap; | ||
| 18 | FILE *bfile, *streetfile_out; | ||
| 19 | char *ptr, *input = malloc(65335); | ||
| 20 | char *ort = malloc(65335), vorwahl_block[16]; | ||
| 21 | int i, brutes_count = 0, report = 0; | ||
| 22 | brute_t *brutes = malloc(200000*sizeof(brute_t)); | ||
| 23 | |||
| 24 | /* prepare io */ | ||
| 25 | if (argc != 4) exit(1); | ||
| 26 | tbuch = map_file(args[1], 1); | ||
| 27 | zipmap = map_file(args[2], 1); | ||
| 28 | brutemap = map_file(args[3], 1); | ||
| 29 | |||
| 30 | /* read all files */ | ||
| 31 | g_zipmap[0] = (char*)zipmap->addr; | ||
| 32 | for (i=0; i<zipmap->size; ++i) | ||
| 33 | if (!zipmap->addr[i]) | ||
| 34 | g_zipmap[++g_zipmap_size] = (char*)(zipmap->addr + i + 1); | ||
| 35 | qsort(g_zipmap, g_zipmap_size, sizeof(char*), cc); | ||
| 36 | |||
| 37 | ptr = (char*)brutemap->addr; | ||
| 38 | |||
| 39 | /* Split brute records */ | ||
| 40 | while (ptr < (char*)brutemap->addr + brutemap->size) { | ||
| 41 | brutes[brutes_count].count = atol(ptr); ptr += strlen(ptr) + 1; | ||
| 42 | brutes[brutes_count].similarity = atol(ptr); ptr += strlen(ptr) + 1; | ||
| 43 | |||
| 44 | brutes[brutes_count].vorwahl = ptr; ptr += strlen(ptr) + 1; | ||
| 45 | brutes[brutes_count].ort = ptr; ptr += strlen(ptr) + 1; | ||
| 46 | brutes[brutes_count].strasse_1992 = ptr; ptr += strlen(ptr) + 1; | ||
| 47 | brutes[brutes_count].strasse_1995 = ptr; ptr += strlen(ptr) + 1; | ||
| 48 | |||
| 49 | if (brutes[brutes_count].count >= 20 || brutes[brutes_count].similarity >= 70) | ||
| 50 | brutes_count++; | ||
| 51 | } | ||
| 52 | qsort(brutes, brutes_count, sizeof(brute_t), sort_brutes); | ||
| 53 | |||
| 54 | /* count phonebook lines */ | ||
| 55 | for (i=0; i<tbuch->size; ++i) | ||
| 56 | if (!tbuch->addr[i]) | ||
| 57 | ++g_book_size; | ||
| 58 | |||
| 59 | /* We expect 8 columns per line */ | ||
| 60 | g_book_size /= 8; | ||
| 61 | |||
| 62 | g_book = (entry_t*)malloc(g_book_size * sizeof(entry_t)); | ||
| 63 | g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t)); | ||
| 64 | |||
| 65 | /* Split pointers into input files into our arrays */ | ||
| 66 | for (i = 0, ptr = (char*)tbuch->addr; i < g_book_size; ++i) { | ||
| 67 | g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1; | ||
| 68 | g_book[i].ort = ptr; ptr += strlen(ptr) + 1; | ||
| 69 | g_book[i].strasse = ptr; ptr += strlen(ptr) + 1; | ||
| 70 | g_book[i].hnr = ptr; ptr += strlen(ptr) + 1; | ||
| 71 | g_book[i].name = ptr; ptr += strlen(ptr) + 1; | ||
| 72 | g_book[i].vorname = ptr; ptr += strlen(ptr) + 1; | ||
| 73 | g_book[i].nummer = ptr; ptr += strlen(ptr) + 1; | ||
| 74 | g_book[i].plz = ptr; ptr += strlen(ptr) + 1; | ||
| 75 | } | ||
| 76 | /* Copy input array so that it can be sorted by different criteria */ | ||
| 77 | memcpy(g_book_by_name, g_book, g_book_size * sizeof(entry_t)); | ||
| 78 | |||
| 79 | fprintf(stderr, "STEP 1: import done\n"); | ||
| 80 | |||
| 81 | /* Sort the whole thing */ | ||
| 82 | qsort(g_book, g_book_size, sizeof(entry_t), sort_by_voshnvn); | ||
| 83 | qsort(g_book_by_name, g_book_size, sizeof(entry_t), sort_by_vonvh); | ||
| 84 | |||
| 85 | fprintf(stderr, "STEP 2: sort done\n"); | ||
| 86 | |||
| 87 | g_mapfile_out = fopen("zip_mapfile.txt", "w"); | ||
| 88 | streetfile_out = fopen("07_Strasse_fixed", "w"); | ||
| 89 | bfile = fopen("brutemap.txt", "w"); | ||
| 90 | |||
| 91 | while (1) { | ||
| 92 | char flag, *t, *l = fgets(input, 65536, stdin); | ||
| 93 | const char *orig_strasse; | ||
| 94 | entry_t local; | ||
| 95 | brute_t *found_brute; | ||
| 96 | int once = 0, fixed = 0; | ||
| 97 | |||
| 98 | if (!l) break; | ||
| 99 | if ((report++ % 300000) == 0) fprintf(stderr, "% 10d lines done\n", report); | ||
| 100 | |||
| 101 | /* Scan and skip flags */ | ||
| 102 | flag = strtoul(l, &l, 16); ++l; | ||
| 103 | |||
| 104 | /* Copy vorwahl, if in field */ | ||
| 105 | local.vorwahl = advance_and_replace(&l, 9, 0); | ||
| 106 | |||
| 107 | /* Copy over vorwahl for whole exported block, if we're not in cont */ | ||
| 108 | t = advance_and_replace(&l, 9, 0); | ||
| 109 | if (flag < 2) strcpy(vorwahl_block, t); | ||
| 110 | |||
| 111 | /* Only copy over ort from continuations, if present */ | ||
| 112 | if (flag < 2 || (*l != 9) ) local.ort = l; else local.ort = ort; | ||
| 113 | advance_and_replace(&l, 9, 0); | ||
| 114 | |||
| 115 | /* Take copy of ort for continuations, if on start of multi line record */ | ||
| 116 | if (flag == 1) strcpy(ort, local.ort); | ||
| 117 | |||
| 118 | /* Copy rest of the fields verbatim */ | ||
| 119 | local.strasse = advance_and_replace(&l, 9, 0); | ||
| 120 | local.hnr = advance_and_replace(&l, 9, 0); | ||
| 121 | local.name = advance_and_replace(&l, 9, 0); | ||
| 122 | local.vorname = advance_and_replace(&l, 9, 0); | ||
| 123 | local.nummer = advance_and_replace(&l, 9, 0); | ||
| 124 | local.plz = advance_and_replace(&l, 9, ':'); | ||
| 125 | advance_and_replace(&l, 10, 0); | ||
| 126 | |||
| 127 | orig_strasse = local.strasse; | ||
| 128 | rescan: | ||
| 129 | |||
| 130 | if (search_and_verify(&local, sort_by_voshnvn, 1) || | ||
| 131 | search_and_verify(&local, sort_by_voshnv, 1) || | ||
| 132 | search_and_verify(&local, sort_by_vosh, 0) || | ||
| 133 | search_and_verify(&local, sort_by_vos, 0)) { | ||
| 134 | fputs(fixed ? local.strasse : orig_strasse, streetfile_out); | ||
| 135 | fputc(10, streetfile_out); | ||
| 136 | continue; | ||
| 137 | } | ||
| 138 | |||
| 139 | /* If we can't find the street, it might be due to an incorrect vorwahl, try to fix it up */ | ||
| 140 | if (strcmp(local.vorwahl, vorwahl_block)) { | ||
| 141 | local.vorwahl = vorwahl_block; | ||
| 142 | goto rescan; | ||
| 143 | } | ||
| 144 | |||
| 145 | /* If we do have vorwahl + ort + strasse + hnr, but no match, street might have changed name */ | ||
| 146 | if (*local.vorwahl && *local.ort && *local.strasse && *local.hnr && *local.name) { | ||
| 147 | entry_t *found = bsearch_first(&local, g_book_by_name, g_book_size, sizeof(entry_t), sort_by_vonvh); | ||
| 148 | if (found) | ||
| 149 | fprintf(bfile, "%s\t%s\t%s\t%s\t%s\n", local.vorwahl, local.ort, local.strasse, found->strasse, local.hnr); | ||
| 150 | } | ||
| 151 | |||
| 152 | /* See if we can find and correct the street name */ | ||
| 153 | if (!once++) { | ||
| 154 | found_brute = bsearch(&local, brutes, brutes_count, sizeof(brute_t), search_brute); | ||
| 155 | if (found_brute && ( (found_brute->similarity >= 70) || (found_brute->count >= 20)) ) { | ||
| 156 | fixed = found_brute->similarity >= 80; | ||
| 157 | local.strasse = found_brute->strasse_1995; | ||
| 158 | goto rescan; | ||
| 159 | } | ||
| 160 | } | ||
| 161 | |||
| 162 | /* If nothing works, see if the whole village has only one zip, else just print an empty line */ | ||
| 163 | if (!search_and_verify(&local, sort_by_vo, 2)) | ||
| 164 | putchar(10); | ||
| 165 | fputs(orig_strasse, streetfile_out); | ||
| 166 | fputc(10, streetfile_out); | ||
| 167 | } | ||
| 168 | } | ||
| 169 | |||
/*
 * Cuts the next field off the string at *p.
 *
 * The first occurrence of `find` is overwritten with `replace` and *p is
 * moved just behind it. If `find` does not occur (a line with too few
 * fields), the rest of the string becomes the field and *p is moved to the
 * terminating NUL, so that further calls yield empty fields instead of
 * dereferencing a NULL pointer.
 *
 * Returns the start of the field, i.e. the value *p had on entry.
 */
static char * advance_and_replace(char **p, char find, char replace) {
    char *field = *p;
    char *hit = strchr(field, find);

    if (!hit) {
        *p = field + strlen(field);
        return field;
    }

    *hit = replace;
    *p = hit + 1;
    return field;
}
| 177 | |||
/*
 * Binary search that returns the FIRST element comparing equal to key.
 *
 * base points to nel elements of width bytes each, sorted consistently
 * with compar. compar is called as compar(key, element).
 *
 * Returns a pointer to the first matching element, or NULL if there is none.
 */
static void *bsearch_first( const void * const key, const void * base, const size_t nel, const size_t width, int (*compar) (const void *, const void *)) {
    const uint8_t * const first = base;
    const uint8_t *lo = base;
    size_t interval = nel;

    while (interval) {
        const uint8_t *lookat = lo + width * ( interval / 2 );
        int cmp = compar(key, lookat);

        /* A hit only counts if it has no predecessor or the predecessor differs.
           The test has to be on lookat, not on the window start: otherwise any
           hit found before the window moved right would be returned, even with
           equal elements in front of it. */
        if (cmp == 0 && (lookat == first || compar(key, lookat - width)))
            return (void*)lookat;

        if (cmp > 0) {
            /* continue right of lookat */
            lo = lookat + width;
            interval--;
        }
        /* else: continue left of lookat, which also covers "hit, but not the first" */
        interval /= 2;
    }

    return NULL;
}
| 197 | |||
| 198 | /* For if we have vorwahl and ort and strasse */ | ||
| 199 | static int sort_by_voshnvn(const void *a, const void *b) { | ||
| 200 | int res; | ||
| 201 | entry_t *ea = (entry_t *)a; | ||
| 202 | entry_t *eb = (entry_t *)b; | ||
| 203 | |||
| 204 | if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; | ||
| 205 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 206 | if ((res = strcmp(ea->strasse, eb->strasse))) return res; | ||
| 207 | if ((res = strcmp(ea->hnr, eb->hnr ))) return res; | ||
| 208 | if ((res = strcmp(ea->name, eb->name ))) return res; | ||
| 209 | if ((res = strcmp(ea->vorname, eb->vorname))) return res; | ||
| 210 | if ((res = strcmp(ea->nummer, eb->nummer ))) return res; | ||
| 211 | return 0; | ||
| 212 | } | ||
| 213 | |||
| 214 | /* more relaxed, if rufnummer missmatches */ | ||
| 215 | static int sort_by_voshnv(const void *a, const void *b) { | ||
| 216 | int res; | ||
| 217 | entry_t *ea = (entry_t *)a; | ||
| 218 | entry_t *eb = (entry_t *)b; | ||
| 219 | |||
| 220 | if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; | ||
| 221 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 222 | if ((res = strcmp(ea->strasse, eb->strasse))) return res; | ||
| 223 | if ((res = strcmp(ea->hnr, eb->hnr ))) return res; | ||
| 224 | if ((res = strcmp(ea->name, eb->name ))) return res; | ||
| 225 | if ((res = strcmp(ea->vorname, eb->vorname))) return res; | ||
| 226 | return 0; | ||
| 227 | } | ||
| 228 | |||
| 229 | /* more relaxed, if rufnummer missmatches */ | ||
| 230 | static int sort_by_vosh(const void *a, const void *b) { | ||
| 231 | int res; | ||
| 232 | entry_t *ea = (entry_t *)a; | ||
| 233 | entry_t *eb = (entry_t *)b; | ||
| 234 | |||
| 235 | if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; | ||
| 236 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 237 | if ((res = strcmp(ea->strasse, eb->strasse))) return res; | ||
| 238 | if ((res = strcmp(ea->hnr, eb->hnr ))) return res; | ||
| 239 | return 0; | ||
| 240 | } | ||
| 241 | |||
| 242 | /* more relaxed, if rufnummer missmatches */ | ||
| 243 | static int sort_by_vos(const void *a, const void *b) { | ||
| 244 | int res; | ||
| 245 | entry_t *ea = (entry_t *)a; | ||
| 246 | entry_t *eb = (entry_t *)b; | ||
| 247 | |||
| 248 | if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; | ||
| 249 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 250 | if ((res = strcmp(ea->strasse, eb->strasse))) return res; | ||
| 251 | return 0; | ||
| 252 | } | ||
| 253 | |||
| 254 | /* last resort, check if the whole vorwahl+ort set matches a single zip */ | ||
| 255 | static int sort_by_vo(const void *a, const void *b) { | ||
| 256 | int res; | ||
| 257 | entry_t *ea = (entry_t *)a; | ||
| 258 | entry_t *eb = (entry_t *)b; | ||
| 259 | |||
| 260 | if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; | ||
| 261 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 262 | return 0; | ||
| 263 | } | ||
| 264 | |||
| 265 | /* For brute forcing name if we can't find strasse */ | ||
| 266 | static int sort_by_vonvh(const void *a, const void *b) { | ||
| 267 | int res; | ||
| 268 | entry_t *ea = (entry_t *)a; | ||
| 269 | entry_t *eb = (entry_t *)b; | ||
| 270 | |||
| 271 | if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; | ||
| 272 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 273 | if ((res = strcmp(ea->name, eb->name ))) return res; | ||
| 274 | if ((res = strcmp(ea->vorname, eb->vorname))) return res; | ||
| 275 | if ((res = strcmp(ea->hnr, eb->hnr ))) return res; | ||
| 276 | return 0; | ||
| 277 | } | ||
| 278 | |||
| 279 | static int sort_brutes(const void *a, const void *b) { | ||
| 280 | int res; | ||
| 281 | brute_t *ea = (brute_t *)a; | ||
| 282 | brute_t *eb = (brute_t *)b; | ||
| 283 | |||
| 284 | if ((res = strcmp(ea->vorwahl, eb->vorwahl ))) return res; | ||
| 285 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 286 | if ((res = strcmp(ea->strasse_1992, eb->strasse_1992))) return res; | ||
| 287 | return 0; | ||
| 288 | } | ||
| 289 | |||
| 290 | static int search_brute(const void *a, const void *b) { | ||
| 291 | int res; | ||
| 292 | entry_t *ea = (entry_t*)a; | ||
| 293 | brute_t *eb = (brute_t *)b; | ||
| 294 | |||
| 295 | if ((res = strcmp(ea->vorwahl, eb->vorwahl ))) return res; | ||
| 296 | if ((res = strcmp(ea->ort, eb->ort ))) return res; | ||
| 297 | if ((res = strcmp(ea->strasse, eb->strasse_1992))) return res; | ||
| 298 | return 0; | ||
| 299 | } | ||
| 300 | |||
| 301 | /* If zip codes end in dots, always chose the one with more info */ | ||
| 302 | static int test_dot(entry_t * iter, entry_t * found) { | ||
| 303 | char * dot_iter = strchr(iter->plz, '.'); | ||
| 304 | char * dot_found = strchr(found->plz, '.'); | ||
| 305 | int di = 64, df = 64; // magic value large enough so that the MIN() always choses the other one | ||
| 306 | |||
| 307 | if (!dot_iter && !dot_found) return 0; | ||
| 308 | |||
| 309 | if (dot_iter) di = dot_iter - iter->plz; | ||
| 310 | if (dot_found) df = dot_found - found->plz; | ||
| 311 | |||
| 312 | if (memcmp(iter->plz, found->plz, ((di < df) ? di : df))) return 0; | ||
| 313 | |||
| 314 | if (dot_iter && !dot_found) iter->plz = found->plz; | ||
| 315 | if (!dot_iter && dot_found) found->plz = iter->plz; | ||
| 316 | if (dot_iter && dot_found && di > df) found->plz = iter->plz; | ||
| 317 | if (dot_iter && dot_found && di < df) iter->plz = found->plz; | ||
| 318 | |||
| 319 | return 1; | ||
| 320 | } | ||
| 321 | |||
| 322 | static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) { | ||
| 323 | entry_t *test_iter = found + 1; | ||
| 324 | entry_t *end = g_book + g_book_size; | ||
| 325 | |||
| 326 | while (test_iter < end && !compar((void*)candidate, (void*)test_iter)) { | ||
| 327 | if (strcmp(test_iter->plz, found->plz)) { | ||
| 328 | if (test_dot(test_iter, found)) | ||
| 329 | continue; | ||
| 330 | return 0; | ||
| 331 | } | ||
| 332 | ++test_iter; | ||
| 333 | } | ||
| 334 | |||
| 335 | return found; | ||
| 336 | } | ||
| 337 | |||
| 338 | static entry_t * verify_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) { | ||
| 339 | entry_t *test_iter = found + 1; | ||
| 340 | entry_t *end = g_book + g_book_size; | ||
| 341 | char pair[32]; | ||
| 342 | |||
| 343 | if (!g_zipmap_size) | ||
| 344 | return verify_unique_zip(found, candidate, compar); | ||
| 345 | |||
| 346 | /* Do we know about the oldzip-newzip mapping? */ | ||
| 347 | strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, found->plz); | ||
| 348 | if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2)) | ||
| 349 | return found; | ||
| 350 | |||
| 351 | while ((test_iter < end) && !compar((void*)candidate, (void*)test_iter)) { | ||
| 352 | if (strcmp(test_iter->plz, found->plz)) { | ||
| 353 | if (test_dot(test_iter, found)) | ||
| 354 | continue; | ||
| 355 | strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, test_iter->plz); | ||
| 356 | if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2)) { | ||
| 357 | // printf ("FIXED %s -> %s: ", found->plz, test_iter->plz); | ||
| 358 | return test_iter; | ||
| 359 | } | ||
| 360 | } | ||
| 361 | ++test_iter; | ||
| 362 | } | ||
| 363 | |||
| 364 | return found; | ||
| 365 | } | ||
| 366 | |||
| 367 | static int search_and_verify(entry_t *candidate, int (*compar) (const void *, const void *), int flag) { | ||
| 368 | entry_t *found = bsearch_first(candidate, g_book, g_book_size, sizeof(entry_t), compar); | ||
| 369 | |||
| 370 | if (!found) return 0; | ||
| 371 | |||
| 372 | if (flag == 2) | ||
| 373 | found = verify_unique_zip(found, candidate, compar); | ||
| 374 | else | ||
| 375 | found = verify_zip(found, candidate, compar); | ||
| 376 | |||
| 377 | if (!found) | ||
| 378 | return 0; | ||
| 379 | |||
| 380 | if (flag == 1) | ||
| 381 | fprintf(g_mapfile_out, "%s\t%s\n", candidate->plz, found->plz); | ||
| 382 | |||
| 383 | /* Output plz*/ | ||
| 384 | puts(found->plz); | ||
| 385 | |||
| 386 | return 1; | ||
| 387 | } | ||
| 388 | |||
diff --git a/src/postprocess/map_plz.h b/src/postprocess/map_plz.h new file mode 100644 index 0000000..8839b5c --- /dev/null +++ b/src/postprocess/map_plz.h | |||
| @@ -0,0 +1,39 @@ | |||
| 1 | #pragma once | ||
| 2 | |||
| 3 | typedef struct { /* One phone book record; all fields are NUL-terminated strings that are compared with strcmp() */ | ||
| 4 | const char * vorwahl; /* Vorwahl: dialling prefix (area code) */ | ||
| 5 | const char * ort; /* Ort: town */ | ||
| 6 | const char * strasse; /* Strasse: street name */ | ||
| 7 | const char * hnr; /* Hausnummer: house number */ | ||
| 8 | const char * name; /* Nachname: surname */ | ||
| 9 | const char * vorname; /* Vorname: first name */ | ||
| 10 | const char * nummer; /* Rufnummer: phone number */ | ||
| 11 | const char * plz; /* Postleitzahl: zip code */ | ||
| 12 | } entry_t; | ||
| 13 | |||
| 14 | typedef struct { /* One candidate street renaming between the 1992 and 1995 data */ | ||
| 15 | const char * vorwahl; /* dialling prefix (area code) */ | ||
| 16 | const char * ort; /* town */ | ||
| 17 | const char * strasse_1992; /* street name as found in the 1992 input; used as sort and search key */ | ||
| 18 | const char * strasse_1995; /* street name that replaces it when the record is applied */ | ||
| 19 | int similarity; /* second field of a brute map record; presumably the 0-100 score printed by simi.py - TODO confirm */ | ||
| 20 | int count; /* first field of a brute map record; presumably the number of occurrences from uniq -c - TODO confirm */ | ||
| 21 | } brute_t; | ||
| 22 | |||
| 23 | static void *bsearch_first( const void * const key, const void * base, const size_t nel, const size_t width, int (*compar) (const void *, const void *)); | ||
| 24 | static int sort_by_voshnvn(const void *a, const void *b); | ||
| 25 | static int sort_by_voshnv(const void *a, const void *b); | ||
| 26 | static int sort_by_vosh(const void *a, const void *b); | ||
| 27 | static int sort_by_vos(const void *a, const void *b); | ||
| 28 | static int sort_by_vonvh(const void *a, const void *b); | ||
| 29 | static int sort_by_vo(const void *a, const void *b); | ||
| 30 | static int sort_brutes(const void *a, const void *b); | ||
| 31 | static int search_brute(const void *a, const void *b); | ||
| 32 | static int test_dot(entry_t * iter, entry_t * found); | ||
| 33 | |||
/* qsort() comparator for an array of C strings: both arguments point to
   array elements, i.e. to char pointers */
static int cc(const void *a, const void *b) {
    const char *lhs = *(char**)a;
    const char *rhs = *(char**)b;

    return strcmp(lhs, rhs);
}
/* bsearch() comparator for an array of C strings: a is the key string
   itself, b points to an array element, i.e. to a char pointer */
static int cc2(const void *a, const void *b) {
    const char *key = (char*)a;
    const char *element = *(char**)b;

    return strcmp(key, element);
}
| 36 | static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)); | ||
| 37 | static entry_t * verify_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)); | ||
| 38 | static int search_and_verify(entry_t *candidate, int (*compar) (const void *, const void *), int flag); | ||
| 39 | static char * advance_and_replace(char **p, char find, char replace); | ||
diff --git a/src/postprocess/postprocess-1992.sh b/src/postprocess/postprocess-1992.sh new file mode 100644 index 0000000..1e685d2 --- /dev/null +++ b/src/postprocess/postprocess-1992.sh | |||
| @@ -0,0 +1,32 @@ | |||
#!/bin/bash
# Recipe for mapping the 1992 phone book entries to the zip codes found in
# the 1995 data. Requires bash: it uses brace expansion and $'..' quoting.
#
# map_plz is run twice. The first run works with empty helper tables and
# produces brutemap.txt and zip_mapfile.txt. These are compiled into
# brutemap_input.bin and zip_simple_map.bin, which the second run uses.

# Generate file with all relevant columns from 1992
paste 1992_Q2/{01_Flags,12_Vorwahl,12_Vorwahl_block,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl_West,10_Zustellamt_PLZOst} > 1992-fvvoshnvrpp.txt

# Generate lookup file from 1995
paste 1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} | tr '\n\t' '\0' > 1995-voshnvrp.bin

# To debug in lldb (an lldb command, not a shell command, hence commented out;
# note that map_plz expects three file arguments):
# process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin

# Compile plz mapper
cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c

# outputs mapped plz, generates brutemap.txt
touch brutemap_input.bin zip_simple_map.bin
./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl

# generate street name translation table from brutemap,
# only taking into account similar street names
# cut -f 3,4 brutemap.txt | tr '[:upper:]' '[:lower:]' | paste brutemap.txt - | cut -f 1-4,6,7 | ./jaro | cut -f 1-5 > brutemap_filtered.txt

# generate street name translation table from brutemap,
# only taking into account similar street names, new style
# (simi.py prints one score per input line, paste puts it in front of that line)
cut -f 3,4 brutemap.txt | python simi.py | paste - brutemap.txt > brutemap_simifiltered.txt

# Sort and prepare similarity filtered files for the merge
# (uniq -c prepends the occurrence count, sed turns the blank after it into a tab)
cut -f 1-5 brutemap_simifiltered.txt | sort | uniq -c | sed -E $'s:^ *([[:digit:]]+) :\\1\t:' | tr '\n\t' '\0' > brutemap_input.bin

# compile zipmap into a binary format
sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin

# Redo the mapping with the data from brutemap and zipmap
./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl
diff --git a/src/postprocess/simi.py b/src/postprocess/simi.py new file mode 100755 index 0000000..62ff1ff --- /dev/null +++ b/src/postprocess/simi.py | |||
| @@ -0,0 +1,11 @@ | |||
#!/usr/bin/env python3
"""Print a similarity score for each pair of street names read from stdin.

Every input line must consist of exactly two tab separated fields. For each
line one integer is printed: 100 times the mean of the normalized
Ratcliff-Obershelp, Jaro-Winkler and cosine similarities of the two
case-folded fields.

Exactly one output line is produced per input line, so the output can be
pasted next to the input again. A malformed line therefore aborts the script
with a ValueError instead of being skipped.
"""

import textdistance
from sys import stdin

# Stream the input instead of reading it completely into memory first
for line in stdin:
    # NOTE(review): the second field keeps its trailing newline and is scored
    # with it. This is left untouched because it influences the scores that
    # the thresholds of the consumer are compared against.
    x, y = line.split('\t')
    x = x.casefold()
    y = y.casefold()
    v = (textdistance.ratcliff_obershelp.normalized_similarity(x, y)
         + textdistance.jaro_winkler.normalized_similarity(x, y)
         + textdistance.cosine.normalized_similarity(x, y))
    print(int(100 * (v / 3)))
