diff options
author | Dirk Engling <erdgeist@erdgeist.org> | 2019-03-07 15:24:32 +0100 |
---|---|---|
committer | Dirk Engling <erdgeist@erdgeist.org> | 2019-03-07 15:24:32 +0100 |
commit | 0e52aaa51092c5535cd4e2686bc5c15f7f7b9c5b (patch) | |
tree | 60e35c2ca6bc72144681820f72ee982e5558642a /src | |
parent | 2eff180815619302f101a4ff32db6cf293b529c8 (diff) |
Rework merge_entries to account for chunks moving between Zusaetze and Verweise column
Diffstat (limited to 'src')
-rw-r--r-- | src/postprocess/merge_entries.c | 268 |
1 files changed, 123 insertions, 145 deletions
diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c index 3ebfa8c..f89113e 100644 --- a/src/postprocess/merge_entries.c +++ b/src/postprocess/merge_entries.c | |||
@@ -6,17 +6,21 @@ | |||
6 | #include <inttypes.h> | 6 | #include <inttypes.h> |
7 | #include <assert.h> | 7 | #include <assert.h> |
8 | 8 | ||
9 | extern int halfsiphash(const uint8_t *in, const size_t inlen, const uint8_t *k, uint8_t *out, const size_t outlen); | ||
10 | |||
9 | enum { COLUMNS = 15 }; | 11 | enum { COLUMNS = 15 }; |
10 | typedef struct { | 12 | typedef struct { |
11 | char *ptr; | ||
12 | long rows; | 13 | long rows; |
13 | long outoff; | 14 | long outoff; |
14 | long flag; | 15 | long flag; |
16 | int year; | ||
15 | } entry_t; | 17 | } entry_t; |
16 | typedef struct { | 18 | typedef struct { |
17 | char *ptr; | 19 | char *ptr; |
18 | size_t size; | 20 | size_t size; |
21 | uint64_t data; /* might store precision or normalized verweise+zusaetze */ | ||
19 | } outvec_t; | 22 | } outvec_t; |
23 | static outvec_t * g_out_array; | ||
20 | 24 | ||
21 | const char *g_year_map[] = { | 25 | const char *g_year_map[] = { |
22 | "1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", | 26 | "1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", |
@@ -25,11 +29,7 @@ const char *g_year_map[] = { | |||
25 | 0 | 29 | 0 |
26 | }; | 30 | }; |
27 | 31 | ||
28 | void SKIP_1_COLUMN(char **ptr) { *ptr = strchr(*ptr, 10) + 1; } | 32 | static int year_to_offset(const char *year) { |
29 | void SKIP_2_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } | ||
30 | void SKIP_3_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } | ||
31 | |||
32 | int year_to_offset(const char *year) { | ||
33 | const char **y = g_year_map; | 33 | const char **y = g_year_map; |
34 | int off = 0; | 34 | int off = 0; |
35 | while (*y) { | 35 | while (*y) { |
@@ -39,8 +39,33 @@ int year_to_offset(const char *year) { | |||
39 | return -1; | 39 | return -1; |
40 | } | 40 | } |
41 | 41 | ||
42 | static uint64_t string_to_hash(const char *start, const char *end) { | ||
43 | const uint64_t k = 0xdc082e4772c897e7LL; | ||
44 | uint64_t hash = 1LL, acc = 0LL; | ||
45 | while (start < end) { | ||
46 | const char *tokend = memchr(start, ' ', end - start); | ||
47 | const char *tokend2 = memchr(start, '.', end - start); | ||
48 | const char *compare_end = tokend; | ||
49 | |||
50 | if (tokend2 && (!tokend || (tokend > tokend2))) { | ||
51 | tokend = 0; | ||
52 | compare_end = tokend2 + 1; | ||
53 | } | ||
54 | if (!tokend && !tokend2) | ||
55 | compare_end = end; | ||
42 | 56 | ||
43 | int | 57 | if (compare_end != start) { |
58 | halfsiphash((const uint8_t*)start, (const size_t)(compare_end - start), (const uint8_t*)&k, (uint8_t*)&acc, sizeof(acc)); | ||
59 | hash *= acc; | ||
60 | //printf("HASH %" PRIX64 " %" PRIX64 ":%" PRIX64 " TOKEN(%zd): %.*s\n", hash, acc, k, tokend - start, (int)(tokend - start), start); | ||
61 | } | ||
62 | start = compare_end; | ||
63 | if (tokend) start++; // for space, we only compared up to the char | ||
64 | } | ||
65 | return hash; | ||
66 | } | ||
67 | |||
68 | static int | ||
44 | STRCMP_n (const char *p1, const char *p2) | 69 | STRCMP_n (const char *p1, const char *p2) |
45 | { | 70 | { |
46 | const unsigned char *s1 = (const unsigned char *) p1; | 71 | const unsigned char *s1 = (const unsigned char *) p1; |
@@ -57,103 +82,85 @@ STRCMP_n (const char *p1, const char *p2) | |||
57 | return c1 - c2; | 82 | return c1 - c2; |
58 | } | 83 | } |
59 | 84 | ||
60 | int compare_entries(entry_t*a, entry_t*b, int *prec) { | 85 | static int compare_entries(entry_t*a, entry_t*b) { |
61 | char *pa = a->ptr, *pb = b->ptr; | 86 | int col, row, nprec_a, nprec_b, a_is_newer; |
62 | int col, row, res = 0, nprec = -1; | 87 | outvec_t *oa = g_out_array + a->outoff; |
88 | outvec_t *ob = g_out_array + b->outoff; | ||
63 | 89 | ||
64 | /* Multi line entries never match single line entries */ | 90 | /* Multi line entries never match single line entries */ |
65 | if (a->rows != b->rows) | 91 | if (a->rows != b->rows) |
66 | return -1; | 92 | return -1; |
67 | 93 | ||
68 | /* Assume house number precision first .. unless */ | 94 | /* Check all columns except year, flag and coords for equality */ |
69 | if (!memcmp(pa,"2006_Q3",7)) | 95 | for (row=0; row <= a->rows; ++row) |
70 | *prec = 2; | 96 | for (col=2; col<COLUMNS-1; ++col) { |
71 | else | 97 | int off = COLUMNS*row+col; |
72 | *prec = 3; | 98 | if (col == 5) continue; |
73 | 99 | if (col == 8) { | |
74 | if (!memcmp(pb,"2006_Q3",7)) | 100 | if (oa[off].data != ob[off].data) |
75 | nprec = 2; | 101 | return -1; |
76 | else | 102 | else |
77 | nprec = 3; | 103 | continue; |
78 | 104 | } | |
79 | /* Skip year and flags */ | 105 | if (oa[off].size != ob[off].size || memcmp(oa[off].ptr, ob[off].ptr, oa[off].size)) |
80 | SKIP_2_COLUMNS(&pa); | 106 | return -1; |
81 | SKIP_2_COLUMNS(&pb); | 107 | } |
82 | |||
83 | /* Test all columns for identity */ | ||
84 | for (col=2; col<COLUMNS-1; ++col) { | ||
85 | if (!res && STRCMP_n(pa, pb)) | ||
86 | res = -1; | ||
87 | SKIP_1_COLUMN(&pa); | ||
88 | SKIP_1_COLUMN(&pb); | ||
89 | } | ||
90 | |||
91 | /* If no coords, downgrade precision */ | ||
92 | if (*pa == 9) *prec = 1; | ||
93 | if (*pb == 9) nprec = 1; | ||
94 | 108 | ||
95 | /* If entries differ, return after precision has been found */ | 109 | /* we haven't returned, so they're identical enough |
96 | if (res) return res; | 110 | we always want to keep the newest coordinates, |
111 | unless they are not present or from 2006_Q3 */ | ||
112 | a_is_newer = memcmp(oa[0].ptr,ob[0].ptr,oa[0].size) > 0; | ||
113 | nprec_a = memcmp(oa[0].ptr,"2006_Q3",7) ? 2 : 1; | ||
114 | nprec_b = memcmp(ob[0].ptr,"2006_Q3",7) ? 2 : 1; | ||
97 | 115 | ||
98 | /* Only if precision is the same, difference in coordinates | 116 | for (row=0; row <= a->rows; ++row) { |
99 | is significant. | 117 | int present_a = oa[row * COLUMNS + 14].size != 1; |
100 | if (*prec == nprec && STRCMP_n(pa, pb)) | 118 | int present_b = ob[row * COLUMNS + 14].size != 1; |
101 | return -1; */ | ||
102 | 119 | ||
103 | /* Row 1 has been compared, check the rest of lines */ | 120 | if (!present_a) continue; |
104 | for (row=0; row<a->rows; ++row) { | ||
105 | 121 | ||
106 | /* Skip last row's coordinate columns, year and flags */ | 122 | /* If the current entry's coords were copied, use the stored precision */ |
107 | SKIP_3_COLUMNS(&pa); | 123 | if (oa[row * COLUMNS + 14].data > 0) |
108 | SKIP_3_COLUMNS(&pb); | 124 | nprec_a = oa[row * COLUMNS + 14].data; |
109 | 125 | ||
110 | for (col=2; col<COLUMNS-1; ++col) { | 126 | if (!present_b || (nprec_a > nprec_b ) || ( a_is_newer && (nprec_a >= nprec_b))) { |
111 | if (STRCMP_n(pa, pb)) | 127 | ob[row * COLUMNS + 14].ptr = oa[row * COLUMNS + 14].ptr; |
112 | return -1; | 128 | ob[row * COLUMNS + 14].size = oa[row * COLUMNS + 14].size; |
113 | SKIP_1_COLUMN(&pa); | 129 | ob[row * COLUMNS + 14].data = nprec_a; |
114 | SKIP_1_COLUMN(&pb); | ||
115 | } | 130 | } |
116 | |||
117 | /* Only if precision is the same, difference in coordinates | ||
118 | is significant. | ||
119 | if (*prec == nprec && STRCMP_n(pa, pb)) | ||
120 | return -1; */ | ||
121 | } | 131 | } |
122 | return 0; | 132 | return 0; |
123 | } | 133 | } |
124 | 134 | ||
125 | int sort_me(const void *f_a, const void *f_b) { | 135 | static int sort_me(const void *f_a, const void *f_b) { |
126 | entry_t *e_a = (entry_t *)f_a; | 136 | entry_t *e_a = (entry_t *)f_a; |
127 | entry_t *e_b = (entry_t *)f_b; | 137 | entry_t *e_b = (entry_t *)f_b; |
128 | 138 | ||
129 | char * pa = (char*)e_a->ptr; | 139 | outvec_t *oa = g_out_array + e_a->outoff; |
130 | char * pb = (char*)e_b->ptr; | 140 | outvec_t *ob = g_out_array + e_b->outoff; |
131 | 141 | ||
132 | int results[COLUMNS], c; | 142 | int res, row; |
133 | 143 | ||
134 | if (e_a->rows != e_b->rows) | 144 | if (e_a->rows != e_b->rows) |
135 | return e_a->rows - e_b->rows; | 145 | return e_a->rows - e_b->rows; |
136 | 146 | ||
137 | for (c = 0; c<COLUMNS; ++c) { | 147 | for (row = 0; row <= e_a->rows; ++row) { |
138 | results[c] = STRCMP_n(pa, pb); | 148 | outvec_t *oa_row = oa + row * COLUMNS; |
139 | SKIP_1_COLUMN(&pa); | 149 | outvec_t *ob_row = ob + row * COLUMNS; |
140 | SKIP_1_COLUMN(&pb); | 150 | |
151 | if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* Vorwahl */ | ||
152 | if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* Rufnummer */ | ||
153 | if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* PLZ */ | ||
154 | if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* Strasse */ | ||
155 | if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* Hausnummer */ | ||
156 | if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* Nachname */ | ||
157 | if ((res = STRCMP_n(oa_row[ 4].ptr, ob_row[ 4].ptr))) return res; /* Vorname */ | ||
158 | if (oa_row[8].data != ob_row[8].data ) return oa_row[8].data - ob_row[8].data; | ||
141 | } | 159 | } |
142 | 160 | ||
143 | if (results[10]) return results[10]; /* Vorwahl */ | 161 | return STRCMP_n(oa[0].ptr, ob[0].ptr); |
144 | if (results[11]) return results[11]; /* Rufnummer */ | ||
145 | if (results[2]) return results[2]; /* PLZ */ | ||
146 | if (results[3]) return results[3]; /* Nachname */ | ||
147 | if (results[4]) return results[4]; /* Vorname */ | ||
148 | if (results[6]) return results[6]; /* Strasse */ | ||
149 | if (results[7]) return results[7]; /* Hausnummer */ | ||
150 | if (results[8]) return results[8]; /* Verweise */ | ||
151 | if (results[0]) return results[0]; /* Year */ | ||
152 | return 0; | ||
153 | } | 162 | } |
154 | 163 | ||
155 | enum { OUTPUT_BUFFER_SIZE = 1024*1024*128 }; | ||
156 | |||
157 | static void do_escape_string(char * s, size_t len) { | 164 | static void do_escape_string(char * s, size_t len) { |
158 | size_t i; | 165 | size_t i; |
159 | 166 | ||
@@ -187,10 +194,9 @@ static void escape_string(char * s, size_t len) { | |||
187 | 194 | ||
188 | int main(int argc, char **args) { | 195 | int main(int argc, char **args) { |
189 | MAP tbuch = map_file(args[1], 1); | 196 | MAP tbuch = map_file(args[1], 1); |
190 | char *ptr, *start; | 197 | char *ptr; |
191 | entry_t * sort_array; | 198 | entry_t * sort_array; |
192 | outvec_t * out_array; | 199 | int current = -1, outoff = 0, lines = COLUMNS, i; |
193 | int current = -1, outoff = 0, lines = 1, i, truth = 0, truth_prec = -1; | ||
194 | uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; | 200 | uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; |
195 | long flag = 0; | 201 | long flag = 0; |
196 | 202 | ||
@@ -199,24 +205,29 @@ int main(int argc, char **args) { | |||
199 | if (tbuch->addr[i] == 10) | 205 | if (tbuch->addr[i] == 10) |
200 | ++lines; | 206 | ++lines; |
201 | 207 | ||
202 | sort_array = (entry_t*)malloc((lines / COLUMNS) * sizeof(entry_t)); | 208 | g_out_array = (outvec_t*)malloc(lines * sizeof(outvec_t)); |
203 | out_array = (outvec_t*)malloc((lines / COLUMNS) * sizeof(outvec_t)); | 209 | sort_array = (entry_t*)malloc(lines * sizeof(entry_t)); |
204 | 210 | ||
205 | ptr = (char*)tbuch->addr; | 211 | ptr = (char*)tbuch->addr; |
206 | start = ptr; | ||
207 | 212 | ||
208 | while (ptr < (char*)tbuch->addr + tbuch->size) { | 213 | while (ptr < (char*)tbuch->addr + tbuch->size) { |
209 | int c; | 214 | int c, year; |
210 | |||
211 | start = ptr; | ||
212 | 215 | ||
213 | /* Look for field terminator */ | 216 | /* Look for field terminator */ |
214 | for (c=0; c<COLUMNS; ++c) { | 217 | for (c=0; c<COLUMNS; ++c) { |
215 | char * end = strchr(ptr, 10); | 218 | char * end = strchr(ptr, 10); |
216 | if (c==1) { | 219 | if (c==0) |
220 | year = year_to_offset(ptr); | ||
221 | if (c==1) | ||
217 | flag = strtoul(ptr, 0, 16); | 222 | flag = strtoul(ptr, 0, 16); |
218 | out_array[outoff].ptr = end + 1; | 223 | if (c==5) |
219 | } | 224 | g_out_array[outoff+8].data = string_to_hash(ptr, end); |
225 | g_out_array[outoff+c].ptr = ptr; | ||
226 | g_out_array[outoff+c].size = end - ptr; | ||
227 | if (c==8) | ||
228 | g_out_array[outoff+c].data *= string_to_hash(ptr, end); | ||
229 | else | ||
230 | g_out_array[outoff+c].data = 0; | ||
220 | ptr = end + 1; | 231 | ptr = end + 1; |
221 | } | 232 | } |
222 | 233 | ||
@@ -224,54 +235,29 @@ int main(int argc, char **args) { | |||
224 | assert( current >= 0); | 235 | assert( current >= 0); |
225 | sort_array[current].rows++; | 236 | sort_array[current].rows++; |
226 | } else { | 237 | } else { |
227 | sort_array[++current].ptr = start; | 238 | sort_array[++current].rows = 0; |
228 | sort_array[current].rows = 0; | ||
229 | sort_array[current].outoff = outoff; | 239 | sort_array[current].outoff = outoff; |
230 | sort_array[current].flag = flag; | 240 | sort_array[current].flag = flag; |
241 | sort_array[current].year = year; | ||
231 | } | 242 | } |
232 | out_array[outoff].size = ptr - out_array[outoff].ptr; | 243 | outoff += COLUMNS; |
233 | outoff++; | ||
234 | } | 244 | } |
235 | 245 | ||
236 | /* Sort the whole thing */ | 246 | /* Sort the whole thing */ |
237 | qsort(sort_array, current, sizeof(entry_t), sort_me); | 247 | qsort(sort_array, current, sizeof(entry_t), sort_me); |
238 | 248 | ||
239 | for (i=0; i<=current; ++i) { | 249 | for (i=0; i<=current; ++i) { |
240 | int j, dump = 0, prec; | 250 | year_list |= 1LL << sort_array[i].year; |
241 | 251 | if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << sort_array[i].year; | |
242 | int year = year_to_offset(sort_array[i].ptr); | 252 | if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << sort_array[i].year; |
243 | |||
244 | year_list |= 1LL << year; | ||
245 | if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << year; | ||
246 | if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << year; | ||
247 | |||
248 | /* The last entry always needs to be dumped, but check if its | ||
249 | precision is better than the old truth's | ||
250 | The second comparision checks for equality of entries (modulo | ||
251 | coordinate mismatch) | ||
252 | */ | ||
253 | if (i == current) { | ||
254 | compare_entries(sort_array+i, sort_array+i, &prec); | ||
255 | dump = 1; | ||
256 | } else if (compare_entries(sort_array+i, sort_array+i+1, &prec)) | ||
257 | dump = 1; | ||
258 | |||
259 | /* If this entry's precision is higher than the one of possible | ||
260 | earlier matches, then the current entry becomes the truth */ | ||
261 | if (prec >= truth_prec) { | ||
262 | truth = i; | ||
263 | truth_prec = prec; | ||
264 | } | ||
265 | 253 | ||
266 | if (dump) { | 254 | if ((i == current) || compare_entries(sort_array+i, sort_array+i+1)) { |
267 | printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list); | 255 | printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list); |
268 | for (int c=0; c<COLUMNS-2; ++c) { | 256 | for (int c=2; c<COLUMNS; ++c) { |
269 | outvec_t * out = out_array + sort_array[truth].outoff; | 257 | int j, started = 0, skipped = 0; |
270 | int started = 0, skipped = 0; | 258 | for (j=0; j<=sort_array[i].rows; ++j) { |
271 | for (j=0; j<=sort_array[truth].rows; ++j) { | 259 | outvec_t * out = g_out_array + sort_array[i].outoff + j * COLUMNS; |
272 | char *s = strchr(out->ptr, 10); | 260 | if (!out[c].size || *(out[c].ptr) == 9) |
273 | size_t len = s - out->ptr; | ||
274 | if (!len || out->ptr[0] == 9) | ||
275 | skipped++; | 261 | skipped++; |
276 | else { | 262 | else { |
277 | if (!started++) | 263 | if (!started++) |
@@ -279,32 +265,24 @@ int main(int argc, char **args) { | |||
279 | else | 265 | else |
280 | putchar(','); | 266 | putchar(','); |
281 | for (int x=0; x<skipped; ++x) fputs("null,", stdout); | 267 | for (int x=0; x<skipped; ++x) fputs("null,", stdout); |
282 | if (c != COLUMNS-3) | 268 | if (c != COLUMNS-1) |
283 | escape_string(out->ptr, len); | 269 | escape_string(out[c].ptr, out[c].size); |
284 | else { | 270 | else { |
285 | char coords[64], *tab; | 271 | char coords[64], *tab; |
286 | // memcpy(coords, "POINT(", 6); | 272 | memcpy(coords, out[c].ptr, out[c].size); |
287 | // memcpy(coords + 6, out->ptr, len); | 273 | tab = memchr(coords, 9, out[c].size); |
288 | // tab = memchr(coords + 6, 9, len); | 274 | if (tab) *tab = ' '; |
289 | // if (tab) *tab = ' '; | 275 | fwrite(coords, out[c].size, 1, stdout); |
290 | // coords[6+len] = ')'; | ||
291 | // fwrite(coords, 7 + len, 1, stdout); | ||
292 | memcpy(coords, out->ptr, len); | ||
293 | tab = memchr(coords, 9, len); | ||
294 | if (tab) *tab = ' '; | ||
295 | fwrite(coords, len, 1, stdout); | ||
296 | } | 276 | } |
297 | skipped = 0; | 277 | skipped = 0; |
298 | } | 278 | } |
299 | out->ptr = s + 1; | ||
300 | ++out; | 279 | ++out; |
301 | } | 280 | } |
302 | if (started) putchar('}'); | 281 | if (started) putchar('}'); |
303 | if (c<COLUMNS-3) putchar(9); | 282 | if (c<COLUMNS-1) putchar(9); |
304 | } | 283 | } |
305 | putchar(10); | 284 | putchar(10); |
306 | 285 | ||
307 | truth_prec = -1; | ||
308 | year_list = 0; | 286 | year_list = 0; |
309 | bizflag_list = 0; | 287 | bizflag_list = 0; |
310 | revflag_list = 0; | 288 | revflag_list = 0; |