diff options
author | Dirk Engling <erdgeist@erdgeist.org> | 2019-02-02 22:56:02 +0100 |
---|---|---|
committer | Dirk Engling <erdgeist@erdgeist.org> | 2019-02-02 22:56:02 +0100 |
commit | a73a9f7ebe6c82a9210e63700481b0b2dfcb0b4b (patch) | |
tree | 8827b08d032b9b334b8bb0cc7bae5ae3998a7a95 /src | |
parent | 1e838c25d6fb9e9793b8dd4adbf5ce3078e1d0e5 (diff) |
First attempt to sort for unifications
Diffstat (limited to 'src')
-rw-r--r-- | src/postprocess/sort_plz.c | 132 |
1 files changed, 132 insertions, 0 deletions
diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c new file mode 100644 index 0000000..f44cec7 --- /dev/null +++ b/src/postprocess/sort_plz.c | |||
@@ -0,0 +1,132 @@ | |||
1 | /* | ||
2 | target is in current directory: | ||
3 | single/<PLZ> multi/<PLZ> | |
4 | <PLZ> is either [0-9_]{5} or _____ or broken | |
5 | opens files in source directory: | ||
6 | 01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer 09_Verweise 10_Postleitzahl | ||
7 | 11_Ort 12_Vorwahl 13_Rufnummer 16_Koordinaten | ||
8 | appends to all of the above dirs plus | ||
9 | 00_Jahr | ||
10 | */ | ||
11 | |||
12 | #include <sys/stat.h> | ||
13 | #include <stdlib.h> | ||
14 | #include <stdio.h> | ||
15 | #include <string.h> | ||
16 | #include <errno.h> | ||
17 | #include <err.h> | ||
18 | |||
/*
 * Column identifiers. The numeric part mirrors the prefix of the column's
 * file name. The order is NOT numeric: F_10 (Postleitzahl) sits right after
 * F_01 so that loops starting at F_10 or F_02 cover "all remaining columns".
 * F_COUNT is the number of columns and must stay last.
 */
enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_16, F_COUNT };

/* File name of each column, indexed by the enum above (same order). */
static const char *const g_filenames[] = {
    "00_Jahr",          /* F_00 */
    "01_Flags",         /* F_01 */
    "10_Postleitzahl",  /* F_10 */
    "02_Nachname",      /* F_02 */
    "03_Vorname",       /* F_03 */
    "04_Zusaetze",      /* F_04 */
    "07_Strasse",       /* F_07 */
    "08_Hausnummer",    /* F_08 */
    "09_Verweise",      /* F_09 */
    "11_Ort",           /* F_11 */
    "12_Vorwahl",       /* F_12 */
    "13_Rufnummer",     /* F_13 */
    "16_Koordinaten"    /* F_16 */
};

/* Compile-time guard: a negative array size is a hard error, so the build
   breaks as soon as the enum and the name table get out of sync. */
typedef char g_filenames_matches_enum[
    (sizeof g_filenames / sizeof g_filenames[0] == F_COUNT) ? 1 : -1];
23 | |||
24 | FILE * fopen_prefix(char *prefix, int file_id, int readonly) { | ||
25 | char filename[1024]; | ||
26 | snprintf( filename, sizeof(filename), "%s/%s", prefix, g_filenames[file_id]); | ||
27 | return fopen(filename, readonly ? "r" : "a"); | ||
28 | } | ||
29 | |||
30 | int main(int argc, char **args) { | ||
31 | FILE * in_handles[F_COUNT] = { NULL }; | ||
32 | FILE * out_handles[F_COUNT] = { NULL }; | ||
33 | char flags[4]; | ||
34 | int i, in_multi = 0; | ||
35 | char *input = malloc(1024); | ||
36 | size_t input_size = 1024; | ||
37 | |||
38 | /* First open all input files */ | ||
39 | for (i=F_01; i<F_COUNT; ++i) { | ||
40 | in_handles[i] = fopen_prefix(args[1], i, 1); | ||
41 | if (!in_handles[i]) | ||
42 | errx( 1, "Couldn't open file %s\n", g_filenames[i]); | ||
43 | } | ||
44 | |||
45 | mkdir( "multi", 0755); | ||
46 | mkdir( "single", 0755); | ||
47 | |||
48 | /* Get Flags to check if we're processing a continuation */ | ||
49 | while (fgets(flags, 4, in_handles[F_01])) { | ||
50 | char out_dir[32]; | ||
51 | ssize_t linelen; | ||
52 | char flag = strtoul(flags, 0, 16); | ||
53 | char *type = flag & 1 ? "multi/" : "single/"; | ||
54 | |||
55 | /* If we're in multiline mode, we just copy lines as long as we see continuations */ | ||
56 | if (in_multi) { | ||
57 | if (flag & 0x2) { | ||
58 | fputs(args[1], out_handles[F_00]); // write Jahr | ||
59 | fputc(10, out_handles[F_00]); | ||
60 | fwrite(flags, 3, 1, out_handles[F_01]); // copy Flags verbatim | ||
61 | for (i=F_10; i<F_COUNT; ++i) { // process the rest of entries | ||
62 | ssize_t linelen = getline(&input, &input_size, in_handles[i]); | ||
63 | fwrite(input, linelen, 1, out_handles[i]); | ||
64 | } | ||
65 | continue; | ||
66 | } | ||
67 | /* If the entry is not a continuation, close all output files and switch off multi mode */ | ||
68 | for (i=0; i<F_COUNT; ++i) { | ||
69 | fclose(out_handles[i]); | ||
70 | out_handles[i] = NULL; | ||
71 | } | ||
72 | in_multi = 0; | ||
73 | } | ||
74 | |||
75 | if (flag & 0x1) | ||
76 | in_multi = 1; | ||
77 | |||
78 | /* Read Postleitzahl to get destination */ | ||
79 | linelen = getline(&input, &input_size, in_handles[F_10]); | ||
80 | if (linelen && input[linelen - 1] == 10) { // chomp | ||
81 | input[linelen - 1] = 0; | ||
82 | --linelen; | ||
83 | } | ||
84 | |||
85 | if (linelen == 0) // empty PLZ | ||
86 | strcpy(out_dir, in_multi ? "multi/_____" : "single/_____"); | ||
87 | else if (linelen == 5) { // potentially normal | ||
88 | int broken = 0; | ||
89 | char * dest = out_dir + sprintf(out_dir, in_multi ? "multi/" : "single/"); | ||
90 | for (i=0; i<5; ++i) { | ||
91 | if ( (input[i] < '0' || input[i] > '9') && input[i] != '.') { | ||
92 | broken = 1; | ||
93 | break; | ||
94 | } | ||
95 | dest[i] = input[i]; | ||
96 | if (dest[i] == '.') dest[i] = '_'; | ||
97 | } | ||
98 | dest[5] = 0; | ||
99 | if (broken) | ||
100 | strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); | ||
101 | } else | ||
102 | strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); | ||
103 | |||
104 | if (mkdir(out_dir, 0755) == -1 && errno != EEXIST) | ||
105 | errx( 1, "Couldn't create directory %s %d\n", out_dir, errno); | ||
106 | |||
107 | for (i=F_00; i<F_COUNT; ++i) { | ||
108 | out_handles[i] = fopen_prefix(out_dir, i, 0); | ||
109 | if (!out_handles[i]) | ||
110 | errx( 1, "Couldn't open file %s\n", g_filenames[i]); | ||
111 | } | ||
112 | |||
113 | fputs(args[1], out_handles[F_00]); // write Jahr | ||
114 | fputc(10, out_handles[F_00]); | ||
115 | fwrite(flags, 3, 1, out_handles[F_01]); // copy Flags verbatim | ||
116 | fputs(input, out_handles[F_10]); // copy Postleitzahl verbatim | ||
117 | fputc(10, out_handles[F_10]); | ||
118 | |||
119 | for (i=F_02; i<F_COUNT; ++i) { // process the rest of entries | ||
120 | ssize_t linelen = getline(&input, &input_size, in_handles[i]); | ||
121 | fwrite(input, linelen, 1, out_handles[i]); | ||
122 | } | ||
123 | |||
124 | if (!in_multi) | ||
125 | for (i=0; i<F_COUNT; ++i) { | ||
126 | fclose(out_handles[i]); | ||
127 | out_handles[i] = NULL; | ||
128 | } | ||
129 | } | ||
130 | |||
131 | return 0; | ||
132 | } | ||