sbase

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

sort.c (9669B)


      1 /* See LICENSE file for copyright and license details. */
      2 #include <ctype.h>
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 
      7 #include "queue.h"
      8 #include "text.h"
      9 #include "utf.h"
     10 #include "util.h"
     11 
     12 struct keydef {
     13 	int start_column;
     14 	int end_column;
     15 	int start_char;
     16 	int end_char;
     17 	int flags;
     18 	TAILQ_ENTRY(keydef) entry;
     19 };
     20 
     21 struct column {
     22 	struct line line;
     23 	size_t cap;
     24 };
     25 
     26 enum {
     27 	MOD_N      = 1 << 0,
     28 	MOD_STARTB = 1 << 1,
     29 	MOD_ENDB   = 1 << 2,
     30 	MOD_R      = 1 << 3,
     31 	MOD_D      = 1 << 4,
     32 	MOD_F      = 1 << 5,
     33 	MOD_I      = 1 << 6,
     34 	MOD_H      = 1 << 7,
     35 };
     36 
     37 static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
     38 
     39 static int Cflag = 0, cflag = 0, uflag = 0;
     40 static char *fieldsep = NULL;
     41 static size_t fieldseplen = 0;
     42 static struct column col1, col2;
     43 
     44 static void
     45 skipblank(struct line *a)
     46 {
     47 	while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) {
     48 		a->data++;
     49 		a->len--;
     50 	}
     51 }
     52 
     53 static void
     54 skipnonblank(struct line *a)
     55 {
     56 	while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' &&
     57 	                  *(a->data) != '\t')) {
     58 		a->data++;
     59 		a->len--;
     60 	}
     61 }
     62 
     63 static void
     64 skipcolumn(struct line *a, int skip_to_next_col)
     65 {
     66 	char *s;
     67 
     68 	if (fieldsep) {
     69 		if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) {
     70 			if (skip_to_next_col) {
     71 				s += fieldseplen;
     72 				a->data = s;
     73 				a->len = a->len - (s - a->data);
     74 			}
     75 		} else {
     76 			a->data += a->len - 1;
     77 			a->len = 1;
     78 		}
     79 	} else {
     80 		skipblank(a);
     81 		skipnonblank(a);
     82 	}
     83 }
     84 
     85 static void
     86 columns(struct line *line, const struct keydef *kd, struct column *col)
     87 {
     88 	Rune r;
     89 	struct line start, end;
     90 	size_t utflen, rlen;
     91 	int i;
     92 
     93 	start.data = line->data;
     94 	start.len = line->len;
     95 	for (i = 1; i < kd->start_column; i++)
     96 		skipcolumn(&start, 1);
     97 	if (kd->flags & MOD_STARTB)
     98 		skipblank(&start);
     99 	for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) {
    100 		rlen = chartorune(&r, start.data);
    101 		start.data += rlen;
    102 		start.len -= rlen;
    103 		utflen++;
    104 	}
    105 
    106 	end.data = line->data;
    107 	end.len = line->len;
    108 	if (kd->end_column) {
    109 		for (i = 1; i < kd->end_column; i++)
    110 			skipcolumn(&end, 1);
    111 		if (kd->flags & MOD_ENDB)
    112 			skipblank(&end);
    113 		if (kd->end_char) {
    114 			for (utflen = 0; end.len > 1 && utflen < kd->end_char;) {
    115 				rlen = chartorune(&r, end.data);
    116 				end.data += rlen;
    117 				end.len -= rlen;
    118 				utflen++;
    119 			}
    120 		} else {
    121 			skipcolumn(&end, 0);
    122 		}
    123 	} else {
    124 		end.data += end.len - 1;
    125 		end.len = 1;
    126 	}
    127 	col->line.len = MAX(0, end.data - start.data);
    128 	if (!(col->line.data) || col->cap < col->line.len + 1) {
    129 		free(col->line.data);
    130 		col->line.data = emalloc(col->line.len + 1);
    131 	}
    132 	memcpy(col->line.data, start.data, col->line.len);
    133 	col->line.data[col->line.len] = '\0';
    134 }
    135 
    136 static int
    137 skipmodcmp(struct line *a, struct line *b, int flags)
    138 {
    139 	Rune r1, r2;
    140 	size_t offa = 0, offb = 0;
    141 
    142 	do {
    143 		offa += chartorune(&r1, a->data + offa);
    144 		offb += chartorune(&r2, b->data + offb);
    145 
    146 		if (flags & MOD_D && flags & MOD_I) {
    147 			while (offa < a->len && ((!isblankrune(r1) &&
    148 			       !isalnumrune(r1)) || (!isprintrune(r1))))
    149 				offa += chartorune(&r1, a->data + offa);
    150 			while (offb < b->len && ((!isblankrune(r2) &&
    151 			       !isalnumrune(r2)) || (!isprintrune(r2))))
    152 				offb += chartorune(&r2, b->data + offb);
    153 		}
    154 		else if (flags & MOD_D) {
    155 			while (offa < a->len && !isblankrune(r1) &&
    156 			       !isalnumrune(r1))
    157 				offa += chartorune(&r1, a->data + offa);
    158 			while (offb < b->len && !isblankrune(r2) &&
    159 			       !isalnumrune(r2))
    160 				offb += chartorune(&r2, b->data + offb);
    161 		}
    162 		else if (flags & MOD_I) {
    163 			while (offa < a->len && !isprintrune(r1))
    164 				offa += chartorune(&r1, a->data + offa);
    165 			while (offb < b->len && !isprintrune(r2))
    166 				offb += chartorune(&r2, b->data + offb);
    167 		}
    168 		if (flags & MOD_F) {
    169 			r1 = toupperrune(r1);
    170 			r2 = toupperrune(r2);
    171 		}
    172 	} while (r1 && r1 == r2);
    173 
    174 	return r1 - r2;
    175 }
    176 
    177 static int
    178 slinecmp(struct line *a, struct line *b)
    179 {
    180 	int res = 0;
    181 	double x, y;
    182 	struct keydef *kd;
    183 
    184 	TAILQ_FOREACH(kd, &kdhead, entry) {
    185 		columns(a, kd, &col1);
    186 		columns(b, kd, &col2);
    187 
    188 		/* if -u is given, don't use default key definition
    189 		 * unless it is the only one */
    190 		if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) &&
    191 		    TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
    192 			res = 0;
    193 		} else if (kd->flags & MOD_N) {
    194 			x = strtod(col1.line.data, NULL);
    195 			y = strtod(col2.line.data, NULL);
    196 			res = (x < y) ? -1 : (x > y);
    197 		} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
    198 			res = skipmodcmp(&col1.line, &col2.line, kd->flags);
    199 		} else {
    200 			res = linecmp(&col1.line, &col2.line);
    201 		}
    202 
    203 		if (kd->flags & MOD_R)
    204 			res = -res;
    205 		if (res)
    206 			break;
    207 	}
    208 
    209 	return res;
    210 }
    211 
    212 static int
    213 check(FILE *fp, const char *fname)
    214 {
    215 	static struct line prev, cur, tmp;
    216 	static size_t prevsize, cursize, tmpsize;
    217 	ssize_t len;
    218 
    219 	if (!prev.data) {
    220 		if ((len = getline(&prev.data, &prevsize, fp)) < 0)
    221 			eprintf("getline:");
    222 		prev.len = len;
    223 	}
    224 	while ((len = getline(&cur.data, &cursize, fp)) > 0) {
    225 		cur.len = len;
    226 		if (uflag > slinecmp(&cur, &prev)) {
    227 			if (!Cflag) {
    228 				weprintf("disorder %s: ", fname);
    229 				fwrite(cur.data, 1, cur.len, stderr);
    230 			}
    231 			return 1;
    232 		}
    233 		tmp = cur;
    234 		tmpsize = cursize;
    235 		cur = prev;
    236 		cursize = prevsize;
    237 		prev = tmp;
    238 		prevsize = tmpsize;
    239 	}
    240 
    241 	return 0;
    242 }
    243 
    244 static int
    245 parse_flags(char **s, int *flags, int bflag)
    246 {
    247 	while (isalpha((int)**s)) {
    248 		switch (*((*s)++)) {
    249 		case 'b':
    250 			*flags |= bflag;
    251 			break;
    252 		case 'd':
    253 			*flags |= MOD_D;
    254 			break;
    255 		case 'f':
    256 			*flags |= MOD_F;
    257 			break;
    258 		case 'i':
    259 			*flags |= MOD_I;
    260 			break;
    261 		case 'n':
    262 			*flags |= MOD_N;
    263 			break;
    264 		case 'r':
    265 			*flags |= MOD_R;
    266 			break;
    267 		default:
    268 			return -1;
    269 		}
    270 	}
    271 
    272 	return 0;
    273 }
    274 
    275 static void
    276 addkeydef(char *kdstr, int flags)
    277 {
    278 	struct keydef *kd;
    279 
    280 	kd = enmalloc(2, sizeof(*kd));
    281 
    282 	/* parse key definition kdstr with format
    283 	 * start_column[.start_char][flags][,end_column[.end_char][flags]]
    284 	 */
    285 	kd->start_column = 1;
    286 	kd->start_char = 1;
    287 	kd->end_column = 0; /* 0 means end of line */
    288 	kd->end_char = 0;   /* 0 means end of column */
    289 	kd->flags = flags;
    290 
    291 	if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1)
    292 		enprintf(2, "invalid start column in key definition\n");
    293 
    294 	if (*kdstr == '.') {
    295 		if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1)
    296 			enprintf(2, "invalid start character in key "
    297 			         "definition\n");
    298 	}
    299 	if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0)
    300 		enprintf(2, "invalid start flags in key definition\n");
    301 
    302 	if (*kdstr == ',') {
    303 		if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0)
    304 			enprintf(2, "invalid end column in key definition\n");
    305 		if (*kdstr == '.') {
    306 			if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0)
    307 				enprintf(2, "invalid end character in key "
    308 				         "definition\n");
    309 		}
    310 		if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0)
    311 			enprintf(2, "invalid end flags in key definition\n");
    312 	}
    313 
    314 	if (*kdstr != '\0')
    315 		enprintf(2, "invalid key definition\n");
    316 
    317 	TAILQ_INSERT_TAIL(&kdhead, kd, entry);
    318 }
    319 
    320 static void
    321 usage(void)
    322 {
    323 	enprintf(2, "usage: %s [-Cbcdfhimnru] [-o outfile] [-t delim] "
    324 	         "[-k def]... [file ...]\n", argv0);
    325 }
    326 
    327 int
    328 main(int argc, char *argv[])
    329 {
    330 	FILE *fp, *ofp = stdout;
    331 	struct linebuf linebuf = EMPTY_LINEBUF;
    332 	size_t i;
    333 	int global_flags = 0, ret = 0;
    334 	char *outfile = NULL;
    335 
    336 	ARGBEGIN {
    337 	case 'C':
    338 		Cflag = 1;
    339 		break;
    340 	case 'b':
    341 		global_flags |= MOD_STARTB | MOD_ENDB;
    342 		break;
    343 	case 'c':
    344 		cflag = 1;
    345 		break;
    346 	case 'd':
    347 		global_flags |= MOD_D;
    348 		break;
    349 	case 'f':
    350 		global_flags |= MOD_F;
    351 		break;
    352 	case 'h':
    353 		global_flags |= MOD_H;
    354 		break;
    355 	case 'i':
    356 		global_flags |= MOD_I;
    357 		break;
    358 	case 'k':
    359 		addkeydef(EARGF(usage()), global_flags);
    360 		break;
    361 	case 'm':
    362 		/* more or less for free, but for performance-reasons,
    363 		 * we should keep this flag in mind and maybe some later
    364 		 * day implement it properly so we don't run out of memory
    365 		 * while merging large sorted files.
    366 		 */
    367 		break;
    368 	case 'n':
    369 		global_flags |= MOD_N;
    370 		break;
    371 	case 'o':
    372 		outfile = EARGF(usage());
    373 		break;
    374 	case 'r':
    375 		global_flags |= MOD_R;
    376 		break;
    377 	case 't':
    378 		fieldsep = EARGF(usage());
    379 		if (!*fieldsep)
    380 			eprintf("empty delimiter\n");
    381 		fieldseplen = unescape(fieldsep);
    382 		break;
    383 	case 'u':
    384 		uflag = 1;
    385 		break;
    386 	default:
    387 		usage();
    388 	} ARGEND
    389 
    390 	/* -b shall only apply to custom key definitions */
    391 	if (TAILQ_EMPTY(&kdhead) && global_flags)
    392 		addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB));
    393 	addkeydef("1", global_flags & MOD_R);
    394 
    395 	if (!argc) {
    396 		if (Cflag || cflag) {
    397 			if (check(stdin, "<stdin>") && !ret)
    398 				ret = 1;
    399 		} else {
    400 			getlines(stdin, &linebuf);
    401 		}
    402 	} else for (; *argv; argc--, argv++) {
    403 		if (!strcmp(*argv, "-")) {
    404 			*argv = "<stdin>";
    405 			fp = stdin;
    406 		} else if (!(fp = fopen(*argv, "r"))) {
    407 			enprintf(2, "fopen %s:", *argv);
    408 			continue;
    409 		}
    410 		if (Cflag || cflag) {
    411 			if (check(fp, *argv) && !ret)
    412 				ret = 1;
    413 		} else {
    414 			getlines(fp, &linebuf);
    415 		}
    416 		if (fp != stdin && fshut(fp, *argv))
    417 			ret = 2;
    418 	}
    419 
    420 	if (!Cflag && !cflag) {
    421 		if (outfile && !(ofp = fopen(outfile, "w")))
    422 			eprintf("fopen %s:", outfile);
    423 
    424 		if ((global_flags & MOD_H) && linebuf.nlines > 1) {
    425 			qsort(linebuf.lines + 1, linebuf.nlines - 1, sizeof(*linebuf.lines),
    426 					(int (*)(const void *, const void *))slinecmp);
    427 		} else {
    428 			qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
    429 					(int (*)(const void *, const void *))slinecmp);
    430 		}
    431 
    432 		for (i = 0; i < linebuf.nlines; i++) {
    433 			if (!uflag || i == 0 ||
    434 			    slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
    435 				fwrite(linebuf.lines[i].data, 1,
    436 				       linebuf.lines[i].len, ofp);
    437 			}
    438 		}
    439 	}
    440 
    441 	if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") |
    442 	    fshut(stderr, "<stderr>"))
    443 		ret = 2;
    444 
    445 	return ret;
    446 }