sort.c (9669B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <ctype.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "queue.h" 8 #include "text.h" 9 #include "utf.h" 10 #include "util.h" 11 12 struct keydef { 13 int start_column; 14 int end_column; 15 int start_char; 16 int end_char; 17 int flags; 18 TAILQ_ENTRY(keydef) entry; 19 }; 20 21 struct column { 22 struct line line; 23 size_t cap; 24 }; 25 26 enum { 27 MOD_N = 1 << 0, 28 MOD_STARTB = 1 << 1, 29 MOD_ENDB = 1 << 2, 30 MOD_R = 1 << 3, 31 MOD_D = 1 << 4, 32 MOD_F = 1 << 5, 33 MOD_I = 1 << 6, 34 MOD_H = 1 << 7, 35 }; 36 37 static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead); 38 39 static int Cflag = 0, cflag = 0, uflag = 0; 40 static char *fieldsep = NULL; 41 static size_t fieldseplen = 0; 42 static struct column col1, col2; 43 44 static void 45 skipblank(struct line *a) 46 { 47 while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) { 48 a->data++; 49 a->len--; 50 } 51 } 52 53 static void 54 skipnonblank(struct line *a) 55 { 56 while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' && 57 *(a->data) != '\t')) { 58 a->data++; 59 a->len--; 60 } 61 } 62 63 static void 64 skipcolumn(struct line *a, int skip_to_next_col) 65 { 66 char *s; 67 68 if (fieldsep) { 69 if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) { 70 if (skip_to_next_col) { 71 s += fieldseplen; 72 a->data = s; 73 a->len = a->len - (s - a->data); 74 } 75 } else { 76 a->data += a->len - 1; 77 a->len = 1; 78 } 79 } else { 80 skipblank(a); 81 skipnonblank(a); 82 } 83 } 84 85 static void 86 columns(struct line *line, const struct keydef *kd, struct column *col) 87 { 88 Rune r; 89 struct line start, end; 90 size_t utflen, rlen; 91 int i; 92 93 start.data = line->data; 94 start.len = line->len; 95 for (i = 1; i < kd->start_column; i++) 96 skipcolumn(&start, 1); 97 if (kd->flags & MOD_STARTB) 98 skipblank(&start); 99 for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) { 100 rlen = chartorune(&r, start.data); 101 start.data += rlen; 102 start.len -= rlen; 103 utflen++; 104 } 105 106 end.data = line->data; 107 end.len = line->len; 108 if (kd->end_column) { 109 for (i = 1; i < kd->end_column; i++) 110 skipcolumn(&end, 1); 111 if (kd->flags & MOD_ENDB) 112 skipblank(&end); 113 if (kd->end_char) { 114 for (utflen = 0; end.len > 1 && utflen < kd->end_char;) { 115 rlen = chartorune(&r, end.data); 116 end.data += rlen; 117 end.len -= rlen; 118 utflen++; 119 } 120 } else { 121 skipcolumn(&end, 0); 122 } 123 } else { 124 end.data += end.len - 1; 125 end.len = 1; 126 } 127 col->line.len = MAX(0, end.data - start.data); 128 if (!(col->line.data) || col->cap < col->line.len + 1) { 129 free(col->line.data); 130 col->line.data = emalloc(col->line.len + 1); 131 } 132 memcpy(col->line.data, start.data, col->line.len); 133 col->line.data[col->line.len] = '\0'; 134 } 135 136 static int 137 skipmodcmp(struct line *a, struct line *b, int flags) 138 { 139 Rune r1, r2; 140 size_t offa = 0, offb = 0; 141 142 do { 143 offa += chartorune(&r1, a->data + offa); 144 offb += chartorune(&r2, b->data + offb); 145 146 if (flags & MOD_D && flags & MOD_I) { 147 while (offa < a->len && ((!isblankrune(r1) && 148 !isalnumrune(r1)) || (!isprintrune(r1)))) 149 offa += chartorune(&r1, a->data + offa); 150 while (offb < b->len && ((!isblankrune(r2) && 151 !isalnumrune(r2)) || (!isprintrune(r2)))) 152 offb += chartorune(&r2, b->data + offb); 153 } 154 else if (flags & MOD_D) { 155 while (offa < a->len && !isblankrune(r1) && 156 !isalnumrune(r1)) 157 offa += chartorune(&r1, a->data + offa); 158 while (offb < b->len && !isblankrune(r2) && 159 !isalnumrune(r2)) 160 offb += chartorune(&r2, b->data + offb); 161 } 162 else if (flags & MOD_I) { 163 while (offa < a->len && !isprintrune(r1)) 164 offa += chartorune(&r1, a->data + offa); 165 while (offb < b->len && !isprintrune(r2)) 166 offb += chartorune(&r2, b->data + offb); 167 } 168 if (flags & MOD_F) { 169 r1 = toupperrune(r1); 170 r2 = toupperrune(r2); 171 } 172 } while (r1 && r1 == r2); 173 174 return r1 - r2; 175 } 176 177 static int 178 slinecmp(struct line *a, struct line *b) 179 { 180 int res = 0; 181 double x, y; 182 struct keydef *kd; 183 184 TAILQ_FOREACH(kd, &kdhead, entry) { 185 columns(a, kd, &col1); 186 columns(b, kd, &col2); 187 188 /* if -u is given, don't use default key definition 189 * unless it is the only one */ 190 if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) && 191 TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) { 192 res = 0; 193 } else if (kd->flags & MOD_N) { 194 x = strtod(col1.line.data, NULL); 195 y = strtod(col2.line.data, NULL); 196 res = (x < y) ? -1 : (x > y); 197 } else if (kd->flags & (MOD_D | MOD_F | MOD_I)) { 198 res = skipmodcmp(&col1.line, &col2.line, kd->flags); 199 } else { 200 res = linecmp(&col1.line, &col2.line); 201 } 202 203 if (kd->flags & MOD_R) 204 res = -res; 205 if (res) 206 break; 207 } 208 209 return res; 210 } 211 212 static int 213 check(FILE *fp, const char *fname) 214 { 215 static struct line prev, cur, tmp; 216 static size_t prevsize, cursize, tmpsize; 217 ssize_t len; 218 219 if (!prev.data) { 220 if ((len = getline(&prev.data, &prevsize, fp)) < 0) 221 eprintf("getline:"); 222 prev.len = len; 223 } 224 while ((len = getline(&cur.data, &cursize, fp)) > 0) { 225 cur.len = len; 226 if (uflag > slinecmp(&cur, &prev)) { 227 if (!Cflag) { 228 weprintf("disorder %s: ", fname); 229 fwrite(cur.data, 1, cur.len, stderr); 230 } 231 return 1; 232 } 233 tmp = cur; 234 tmpsize = cursize; 235 cur = prev; 236 cursize = prevsize; 237 prev = tmp; 238 prevsize = tmpsize; 239 } 240 241 return 0; 242 } 243 244 static int 245 parse_flags(char **s, int *flags, int bflag) 246 { 247 while (isalpha((int)**s)) { 248 switch (*((*s)++)) { 249 case 'b': 250 *flags |= bflag; 251 break; 252 case 'd': 253 *flags |= MOD_D; 254 break; 255 case 'f': 256 *flags |= MOD_F; 257 break; 258 case 'i': 259 *flags |= MOD_I; 260 break; 261 case 'n': 262 *flags |= MOD_N; 263 break; 264 case 'r': 265 *flags |= MOD_R; 266 break; 267 default: 268 return -1; 269 } 270 } 271 272 return 0; 273 } 274 275 static void 276 addkeydef(char *kdstr, int flags) 277 { 278 struct keydef *kd; 279 280 kd = enmalloc(2, sizeof(*kd)); 281 282 /* parse key definition kdstr with format 283 * start_column[.start_char][flags][,end_column[.end_char][flags]] 284 */ 285 kd->start_column = 1; 286 kd->start_char = 1; 287 kd->end_column = 0; /* 0 means end of line */ 288 kd->end_char = 0; /* 0 means end of column */ 289 kd->flags = flags; 290 291 if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1) 292 enprintf(2, "invalid start column in key definition\n"); 293 294 if (*kdstr == '.') { 295 if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1) 296 enprintf(2, "invalid start character in key " 297 "definition\n"); 298 } 299 if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0) 300 enprintf(2, "invalid start flags in key definition\n"); 301 302 if (*kdstr == ',') { 303 if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0) 304 enprintf(2, "invalid end column in key definition\n"); 305 if (*kdstr == '.') { 306 if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0) 307 enprintf(2, "invalid end character in key " 308 "definition\n"); 309 } 310 if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0) 311 enprintf(2, "invalid end flags in key definition\n"); 312 } 313 314 if (*kdstr != '\0') 315 enprintf(2, "invalid key definition\n"); 316 317 TAILQ_INSERT_TAIL(&kdhead, kd, entry); 318 } 319 320 static void 321 usage(void) 322 { 323 enprintf(2, "usage: %s [-Cbcdfhimnru] [-o outfile] [-t delim] " 324 "[-k def]... [file ...]\n", argv0); 325 } 326 327 int 328 main(int argc, char *argv[]) 329 { 330 FILE *fp, *ofp = stdout; 331 struct linebuf linebuf = EMPTY_LINEBUF; 332 size_t i; 333 int global_flags = 0, ret = 0; 334 char *outfile = NULL; 335 336 ARGBEGIN { 337 case 'C': 338 Cflag = 1; 339 break; 340 case 'b': 341 global_flags |= MOD_STARTB | MOD_ENDB; 342 break; 343 case 'c': 344 cflag = 1; 345 break; 346 case 'd': 347 global_flags |= MOD_D; 348 break; 349 case 'f': 350 global_flags |= MOD_F; 351 break; 352 case 'h': 353 global_flags |= MOD_H; 354 break; 355 case 'i': 356 global_flags |= MOD_I; 357 break; 358 case 'k': 359 addkeydef(EARGF(usage()), global_flags); 360 break; 361 case 'm': 362 /* more or less for free, but for performance-reasons, 363 * we should keep this flag in mind and maybe some later 364 * day implement it properly so we don't run out of memory 365 * while merging large sorted files. 366 */ 367 break; 368 case 'n': 369 global_flags |= MOD_N; 370 break; 371 case 'o': 372 outfile = EARGF(usage()); 373 break; 374 case 'r': 375 global_flags |= MOD_R; 376 break; 377 case 't': 378 fieldsep = EARGF(usage()); 379 if (!*fieldsep) 380 eprintf("empty delimiter\n"); 381 fieldseplen = unescape(fieldsep); 382 break; 383 case 'u': 384 uflag = 1; 385 break; 386 default: 387 usage(); 388 } ARGEND 389 390 /* -b shall only apply to custom key definitions */ 391 if (TAILQ_EMPTY(&kdhead) && global_flags) 392 addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB)); 393 addkeydef("1", global_flags & MOD_R); 394 395 if (!argc) { 396 if (Cflag || cflag) { 397 if (check(stdin, "<stdin>") && !ret) 398 ret = 1; 399 } else { 400 getlines(stdin, &linebuf); 401 } 402 } else for (; *argv; argc--, argv++) { 403 if (!strcmp(*argv, "-")) { 404 *argv = "<stdin>"; 405 fp = stdin; 406 } else if (!(fp = fopen(*argv, "r"))) { 407 enprintf(2, "fopen %s:", *argv); 408 continue; 409 } 410 if (Cflag || cflag) { 411 if (check(fp, *argv) && !ret) 412 ret = 1; 413 } else { 414 getlines(fp, &linebuf); 415 } 416 if (fp != stdin && fshut(fp, *argv)) 417 ret = 2; 418 } 419 420 if (!Cflag && !cflag) { 421 if (outfile && !(ofp = fopen(outfile, "w"))) 422 eprintf("fopen %s:", outfile); 423 424 if ((global_flags & MOD_H) && linebuf.nlines > 1) { 425 qsort(linebuf.lines + 1, linebuf.nlines - 1, sizeof(*linebuf.lines), 426 (int (*)(const void *, const void *))slinecmp); 427 } else { 428 qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines), 429 (int (*)(const void *, const void *))slinecmp); 430 } 431 432 for (i = 0; i < linebuf.nlines; i++) { 433 if (!uflag || i == 0 || 434 slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) { 435 fwrite(linebuf.lines[i].data, 1, 436 linebuf.lines[i].len, ofp); 437 } 438 } 439 } 440 441 if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") | 442 fshut(stderr, "<stderr>")) 443 ret = 2; 444 445 return ret; 446 }