diff options
-rw-r--r-- | src/postprocess/merge_entries.c | 313 | ||||
-rw-r--r-- | src/postprocess/postgres.txt | 41 | ||||
-rw-r--r-- | src/postprocess/postprocess.sh | 18 | ||||
-rw-r--r-- | src/postprocess/sort_plz.c | 4 |
4 files changed, 374 insertions, 2 deletions
diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c new file mode 100644 index 0000000..3ebfa8c --- /dev/null +++ b/src/postprocess/merge_entries.c | |||
@@ -0,0 +1,313 @@ | |||
1 | #include "mystdlib.h" | ||
2 | |||
3 | #include <stdlib.h> | ||
4 | #include <stdio.h> | ||
5 | #include <string.h> | ||
6 | #include <inttypes.h> | ||
7 | #include <assert.h> | ||
8 | |||
9 | enum { COLUMNS = 15 }; | ||
10 | typedef struct { | ||
11 | char *ptr; | ||
12 | long rows; | ||
13 | long outoff; | ||
14 | long flag; | ||
15 | } entry_t; | ||
16 | typedef struct { | ||
17 | char *ptr; | ||
18 | size_t size; | ||
19 | } outvec_t; | ||
20 | |||
21 | const char *g_year_map[] = { | ||
22 | "1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", | ||
23 | "2002_Q3", "2003_Q1", "2003_Q3", "2004_Q1", "2004_Q3", "2005_Q1", "2005_Q3", "2006_Q1", "2006_Q3", "2007_Q1", "2007_Q3", "2008_Q1", "2008_Q3", "2009_Q1", "2009_Q3", "2010_Q1", "2010_Q3", | ||
24 | "2011_Q1", "2011_Q3", "2012_Q1", "2012_Q3", "2013_Q1", "2013_Q3", "2014_Q1", "2014_Q3", "2015_Q1", "2015_Q3", "2016_Q1", "2016_Q3", "2017_Q1", "2017_Q3", "2018_Q1", "2018_Q3", "2019_Q1", | ||
25 | 0 | ||
26 | }; | ||
27 | |||
28 | void SKIP_1_COLUMN(char **ptr) { *ptr = strchr(*ptr, 10) + 1; } | ||
29 | void SKIP_2_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } | ||
30 | void SKIP_3_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } | ||
31 | |||
32 | int year_to_offset(const char *year) { | ||
33 | const char **y = g_year_map; | ||
34 | int off = 0; | ||
35 | while (*y) { | ||
36 | if (!memcmp(year, *y, 7)) return off; | ||
37 | ++off; ++y; | ||
38 | } | ||
39 | return -1; | ||
40 | } | ||
41 | |||
42 | |||
43 | int | ||
44 | STRCMP_n (const char *p1, const char *p2) | ||
45 | { | ||
46 | const unsigned char *s1 = (const unsigned char *) p1; | ||
47 | const unsigned char *s2 = (const unsigned char *) p2; | ||
48 | unsigned char c1, c2; | ||
49 | do | ||
50 | { | ||
51 | c1 = (unsigned char) *s1++; | ||
52 | c2 = (unsigned char) *s2++; | ||
53 | if (c1 == 10) | ||
54 | return c1 - c2; | ||
55 | } | ||
56 | while (c1 == c2); | ||
57 | return c1 - c2; | ||
58 | } | ||
59 | |||
60 | int compare_entries(entry_t*a, entry_t*b, int *prec) { | ||
61 | char *pa = a->ptr, *pb = b->ptr; | ||
62 | int col, row, res = 0, nprec = -1; | ||
63 | |||
64 | /* Multi line entries never match single line entries */ | ||
65 | if (a->rows != b->rows) | ||
66 | return -1; | ||
67 | |||
68 | /* Assume house number precision first .. unless */ | ||
69 | if (!memcmp(pa,"2006_Q3",7)) | ||
70 | *prec = 2; | ||
71 | else | ||
72 | *prec = 3; | ||
73 | |||
74 | if (!memcmp(pb,"2006_Q3",7)) | ||
75 | nprec = 2; | ||
76 | else | ||
77 | nprec = 3; | ||
78 | |||
79 | /* Skip year and flags */ | ||
80 | SKIP_2_COLUMNS(&pa); | ||
81 | SKIP_2_COLUMNS(&pb); | ||
82 | |||
83 | /* Test all columns for identity */ | ||
84 | for (col=2; col<COLUMNS-1; ++col) { | ||
85 | if (!res && STRCMP_n(pa, pb)) | ||
86 | res = -1; | ||
87 | SKIP_1_COLUMN(&pa); | ||
88 | SKIP_1_COLUMN(&pb); | ||
89 | } | ||
90 | |||
91 | /* If no coords, downgrade precision */ | ||
92 | if (*pa == 9) *prec = 1; | ||
93 | if (*pb == 9) nprec = 1; | ||
94 | |||
95 | /* If entries differ, return after precision has been found */ | ||
96 | if (res) return res; | ||
97 | |||
98 | /* Only if precision is the same, difference in coordinates | ||
99 | is significant. | ||
100 | if (*prec == nprec && STRCMP_n(pa, pb)) | ||
101 | return -1; */ | ||
102 | |||
103 | /* Row 1 has been compared, check the rest of lines */ | ||
104 | for (row=0; row<a->rows; ++row) { | ||
105 | |||
106 | /* Skip last row's coordinate columns, year and flags */ | ||
107 | SKIP_3_COLUMNS(&pa); | ||
108 | SKIP_3_COLUMNS(&pb); | ||
109 | |||
110 | for (col=2; col<COLUMNS-1; ++col) { | ||
111 | if (STRCMP_n(pa, pb)) | ||
112 | return -1; | ||
113 | SKIP_1_COLUMN(&pa); | ||
114 | SKIP_1_COLUMN(&pb); | ||
115 | } | ||
116 | |||
117 | /* Only if precision is the same, difference in coordinates | ||
118 | is significant. | ||
119 | if (*prec == nprec && STRCMP_n(pa, pb)) | ||
120 | return -1; */ | ||
121 | } | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | int sort_me(const void *f_a, const void *f_b) { | ||
126 | entry_t *e_a = (entry_t *)f_a; | ||
127 | entry_t *e_b = (entry_t *)f_b; | ||
128 | |||
129 | char * pa = (char*)e_a->ptr; | ||
130 | char * pb = (char*)e_b->ptr; | ||
131 | |||
132 | int results[COLUMNS], c; | ||
133 | |||
134 | if (e_a->rows != e_b->rows) | ||
135 | return e_a->rows - e_b->rows; | ||
136 | |||
137 | for (c = 0; c<COLUMNS; ++c) { | ||
138 | results[c] = STRCMP_n(pa, pb); | ||
139 | SKIP_1_COLUMN(&pa); | ||
140 | SKIP_1_COLUMN(&pb); | ||
141 | } | ||
142 | |||
143 | if (results[10]) return results[10]; /* Vorwahl */ | ||
144 | if (results[11]) return results[11]; /* Rufnummer */ | ||
145 | if (results[2]) return results[2]; /* PLZ */ | ||
146 | if (results[3]) return results[3]; /* Nachname */ | ||
147 | if (results[4]) return results[4]; /* Vorname */ | ||
148 | if (results[6]) return results[6]; /* Strasse */ | ||
149 | if (results[7]) return results[7]; /* Hausnummer */ | ||
150 | if (results[8]) return results[8]; /* Verweise */ | ||
151 | if (results[0]) return results[0]; /* Year */ | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | enum { OUTPUT_BUFFER_SIZE = 1024*1024*128 }; | ||
156 | |||
157 | static void do_escape_string(char * s, size_t len) { | ||
158 | size_t i; | ||
159 | |||
160 | putchar('"'); | ||
161 | for (i=0; i<len; ++i) { | ||
162 | switch(s[i]) { | ||
163 | case '\\': | ||
164 | putchar('\\'); | ||
165 | case '"': | ||
166 | putchar('\\'); | ||
167 | case ',': | ||
168 | putchar('\\'); | ||
169 | default: | ||
170 | break; | ||
171 | } | ||
172 | putchar(s[i]); | ||
173 | } | ||
174 | putchar('"'); | ||
175 | } | ||
176 | |||
177 | static void escape_string(char * s, size_t len) { | ||
178 | size_t i; | ||
179 | if (len == 4 && ( ( s[0] | 0x20 ) == 'n' ) && ( ( s[1] | 0x20 ) == 'u' ) && ( ( s[2] | 0x20 ) == 'l' ) && ( ( s[3] | 0x20 ) == 'l' ) ) | ||
180 | return do_escape_string(s, len); | ||
181 | for (i=0; i<len; ++i) | ||
182 | switch(s[i]) { | ||
183 | case '{': case '\\': case '}': case '"': case ',': return do_escape_string(s, len); | ||
184 | } | ||
185 | fwrite(s, len, 1, stdout); | ||
186 | } | ||
187 | |||
188 | int main(int argc, char **args) { | ||
189 | MAP tbuch = map_file(args[1], 1); | ||
190 | char *ptr, *start; | ||
191 | entry_t * sort_array; | ||
192 | outvec_t * out_array; | ||
193 | int current = -1, outoff = 0, lines = 1, i, truth = 0, truth_prec = -1; | ||
194 | uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; | ||
195 | long flag = 0; | ||
196 | |||
197 | /* Estimate upper bound for amount of lines */ | ||
198 | for (i=0; i<tbuch->size; ++i) | ||
199 | if (tbuch->addr[i] == 10) | ||
200 | ++lines; | ||
201 | |||
202 | sort_array = (entry_t*)malloc((lines / COLUMNS) * sizeof(entry_t)); | ||
203 | out_array = (outvec_t*)malloc((lines / COLUMNS) * sizeof(outvec_t)); | ||
204 | |||
205 | ptr = (char*)tbuch->addr; | ||
206 | start = ptr; | ||
207 | |||
208 | while (ptr < (char*)tbuch->addr + tbuch->size) { | ||
209 | int c; | ||
210 | |||
211 | start = ptr; | ||
212 | |||
213 | /* Look for field terminator */ | ||
214 | for (c=0; c<COLUMNS; ++c) { | ||
215 | char * end = strchr(ptr, 10); | ||
216 | if (c==1) { | ||
217 | flag = strtoul(ptr, 0, 16); | ||
218 | out_array[outoff].ptr = end + 1; | ||
219 | } | ||
220 | ptr = end + 1; | ||
221 | } | ||
222 | |||
223 | if (flag&2) { | ||
224 | assert( current >= 0); | ||
225 | sort_array[current].rows++; | ||
226 | } else { | ||
227 | sort_array[++current].ptr = start; | ||
228 | sort_array[current].rows = 0; | ||
229 | sort_array[current].outoff = outoff; | ||
230 | sort_array[current].flag = flag; | ||
231 | } | ||
232 | out_array[outoff].size = ptr - out_array[outoff].ptr; | ||
233 | outoff++; | ||
234 | } | ||
235 | |||
236 | /* Sort the whole thing */ | ||
237 | qsort(sort_array, current, sizeof(entry_t), sort_me); | ||
238 | |||
239 | for (i=0; i<=current; ++i) { | ||
240 | int j, dump = 0, prec; | ||
241 | |||
242 | int year = year_to_offset(sort_array[i].ptr); | ||
243 | |||
244 | year_list |= 1LL << year; | ||
245 | if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << year; | ||
246 | if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << year; | ||
247 | |||
248 | /* The last entry always needs to be dumped, but check if its | ||
249 | precision is better than the old truth's | ||
250 | The second comparision checks for equality of entries (modulo | ||
251 | coordinate mismatch) | ||
252 | */ | ||
253 | if (i == current) { | ||
254 | compare_entries(sort_array+i, sort_array+i, &prec); | ||
255 | dump = 1; | ||
256 | } else if (compare_entries(sort_array+i, sort_array+i+1, &prec)) | ||
257 | dump = 1; | ||
258 | |||
259 | /* If this entry's precision is higher than the one of possible | ||
260 | earlier matches, then the current entry becomes the truth */ | ||
261 | if (prec >= truth_prec) { | ||
262 | truth = i; | ||
263 | truth_prec = prec; | ||
264 | } | ||
265 | |||
266 | if (dump) { | ||
267 | printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list); | ||
268 | for (int c=0; c<COLUMNS-2; ++c) { | ||
269 | outvec_t * out = out_array + sort_array[truth].outoff; | ||
270 | int started = 0, skipped = 0; | ||
271 | for (j=0; j<=sort_array[truth].rows; ++j) { | ||
272 | char *s = strchr(out->ptr, 10); | ||
273 | size_t len = s - out->ptr; | ||
274 | if (!len || out->ptr[0] == 9) | ||
275 | skipped++; | ||
276 | else { | ||
277 | if (!started++) | ||
278 | putchar('{'); | ||
279 | else | ||
280 | putchar(','); | ||
281 | for (int x=0; x<skipped; ++x) fputs("null,", stdout); | ||
282 | if (c != COLUMNS-3) | ||
283 | escape_string(out->ptr, len); | ||
284 | else { | ||
285 | char coords[64], *tab; | ||
286 | // memcpy(coords, "POINT(", 6); | ||
287 | // memcpy(coords + 6, out->ptr, len); | ||
288 | // tab = memchr(coords + 6, 9, len); | ||
289 | // if (tab) *tab = ' '; | ||
290 | // coords[6+len] = ')'; | ||
291 | // fwrite(coords, 7 + len, 1, stdout); | ||
292 | memcpy(coords, out->ptr, len); | ||
293 | tab = memchr(coords, 9, len); | ||
294 | if (tab) *tab = ' '; | ||
295 | fwrite(coords, len, 1, stdout); | ||
296 | } | ||
297 | skipped = 0; | ||
298 | } | ||
299 | out->ptr = s + 1; | ||
300 | ++out; | ||
301 | } | ||
302 | if (started) putchar('}'); | ||
303 | if (c<COLUMNS-3) putchar(9); | ||
304 | } | ||
305 | putchar(10); | ||
306 | |||
307 | truth_prec = -1; | ||
308 | year_list = 0; | ||
309 | bizflag_list = 0; | ||
310 | revflag_list = 0; | ||
311 | } | ||
312 | } | ||
313 | } | ||
diff --git a/src/postprocess/postgres.txt b/src/postprocess/postgres.txt new file mode 100644 index 0000000..d1b4669 --- /dev/null +++ b/src/postprocess/postgres.txt | |||
@@ -0,0 +1,41 @@ | |||
1 | CREATE TABLE telefonbuch ( | ||
2 | presence_flag bigint, | ||
3 | reverse_flag bigint, | ||
4 | biz_flag bigint, | ||
5 | zip character varying(64)[] DEFAULT '{}', | ||
6 | nachname character varying(256)[] DEFAULT '{}', | ||
7 | vorname character varying(192)[] DEFAULT '{}', | ||
8 | zusaetze character varying(512)[] DEFAULT '{}', | ||
9 | strasse character varying(64)[] DEFAULT '{}', | ||
10 | hausnummer character varying(32)[] DEFAULT '{}', | ||
11 | verweise text[] DEFAULT '{}', | ||
12 | ort character varying(64)[] DEFAULT '{}', | ||
13 | vorwahl character varying(10)[] DEFAULT '{}', | ||
14 | rufnummer character varying(32)[] DEFAULT '{}', | ||
15 | web character varying(256)[] DEFAULT '{}', | ||
16 | email character varying(256)[] DEFAULT '{}', | ||
17 | coords character varying(32)[] DEFAULT '{}' | ||
18 | ); | ||
19 | |||
20 | COPY Telefonbuch ( presence_flag, reverse_flag, biz_flag, zip, nachname, vorname, zusaetze, strasse, hausnummer, verweise, ort, vorwahl, rufnummer, web, email, coords ) FROM 'telefonbuch.txt' WITH NULL AS 'NULL'; | ||
21 | |||
22 | CREATE MATERIALZED VIEW nachname_view AS SELECT id, nachname FROM telefonbuch CROSS JOIN UNNEST(nachname) WITH ORDINALITY u(name, name_id) WITH data; | ||
23 | |||
24 | CREATE INDEX idx_hausnummer ON public.telefonbuch USING btree (hausnummer); | ||
25 | CREATE INDEX idx_multilinegroup ON public.telefonbuch USING btree (multilinegroup); | ||
26 | CREATE INDEX idx_nachname ON public.telefonbuch USING btree (nachname); | ||
27 | CREATE INDEX idx_ort ON public.telefonbuch USING btree (ort); | ||
28 | CREATE INDEX idx_rufnummer ON public.telefonbuch USING btree (rufnummer); | ||
29 | CREATE INDEX idx_strasse ON public.telefonbuch USING btree (strasse); | ||
30 | CREATE INDEX idx_vorname ON public.telefonbuch USING btree (vorname); | ||
31 | CREATE INDEX idx_vorwahl ON public.telefonbuch USING btree (vorwahl); | ||
32 | CREATE INDEX idx_zip ON public.telefonbuch USING btree (zip); | ||
33 | CREATE INDEX idx_zusaetze ON public.telefonbuch USING btree (zusaetze); | ||
34 | |||
35 | CREATE INDEX trgm_idx_nachname ON public.telefonbuch USING gin (nachname gin_trgm_ops); | ||
36 | CREATE INDEX trgm_idx_rufnummer ON public.telefonbuch USING gin (rufnummer gin_trgm_ops); | ||
37 | CREATE INDEX trgm_idx_verweise ON public.telefonbuch USING gin (verweise gin_trgm_ops); | ||
38 | CREATE INDEX trgm_idx_vorname ON public.telefonbuch USING gin (vorname gin_trgm_ops); | ||
39 | CREATE INDEX trgm_idx_zusaetze ON public.telefonbuch USING gin (zusaetze gin_trgm_ops); | ||
40 | |||
41 | SELECT *, ST_AsText(coords) FROM Telefonbuch WHERE nachname = 'Engling'; | ||
diff --git a/src/postprocess/postprocess.sh b/src/postprocess/postprocess.sh new file mode 100644 index 0000000..cc63a4e --- /dev/null +++ b/src/postprocess/postprocess.sh | |||
@@ -0,0 +1,18 @@ | |||
1 | export LANG=C | ||
2 | export LC_ALL=C | ||
3 | export LC_CTYPE=C | ||
4 | |||
5 | sudo sysctl kern.maxfilesperproc=65563 | ||
6 | sudo sysctl kern.maxfiles=65563 | ||
7 | ulimit -n 65563 | ||
8 | |||
9 | for a in 199[56789]_Q? 20*_Q?; do echo $a; ./sort_plz $a; done | ||
10 | |||
11 | rm -rf sorted | ||
12 | mkdir sorted | ||
13 | cd output | ||
14 | |||
15 | for a in *; do echo $a >&2; ../merge_entries $a | iconv -f iso8859-15 -t utf-8 > ../sorted/$a; done | ||
16 | |||
17 | cd ../output | ||
18 | |||
diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c index 26ac9d0..4c30ea3 100644 --- a/src/postprocess/sort_plz.c +++ b/src/postprocess/sort_plz.c | |||
@@ -17,10 +17,10 @@ | |||
17 | #include <unistd.h> | 17 | #include <unistd.h> |
18 | #include <err.h> | 18 | #include <err.h> |
19 | 19 | ||
20 | enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_16, F_COUNT }; | 20 | enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_15, F_16, F_COUNT }; |
21 | 21 | ||
22 | static char *g_filenames[] = { | 22 | static char *g_filenames[] = { |
23 | "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_15_Email_Webadresse", "16_Koordinaten" }; | 23 | "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_Webadresse", "15_Email", "16_Koordinaten" }; |
24 | 24 | ||
25 | typedef struct { | 25 | typedef struct { |
26 | char plz[8]; | 26 | char plz[8]; |