sed.c (41884B)
1 /* FIXME: summary 2 * decide whether we enforce valid UTF-8, right now it's enforced in certain 3 * parts of the script, but not the input... 4 * nul bytes cause explosions due to use of libc string functions. thoughts? 5 * lack of newline at end of file, currently we add one. what should we do? 6 * allow "\\t" for "\t" etc. in regex? in replacement text? 7 * POSIX says don't flush on N when out of input, but GNU and busybox do. 8 */ 9 10 #include <ctype.h> 11 #include <errno.h> 12 #include <regex.h> 13 #include <stdlib.h> 14 #include <string.h> 15 16 #include "utf.h" 17 #include "util.h" 18 19 /* Types */ 20 21 /* used as queue for writes and stack for {,:,b,t */ 22 typedef struct { 23 void **data; 24 size_t size; 25 size_t cap; 26 } Vec; 27 28 /* used for arbitrary growth, str is a C string 29 * FIXME: does it make sense to keep track of length? or just rely on libc 30 * string functions? If we want to support nul bytes everything changes 31 */ 32 typedef struct { 33 char *str; 34 size_t cap; 35 } String; 36 37 typedef struct Cmd Cmd; 38 typedef struct { 39 void (*fn)(Cmd *); 40 char *(*getarg)(Cmd *, char *); 41 void (*freearg)(Cmd *); 42 unsigned char naddr; 43 } Fninfo; 44 45 typedef struct { 46 union { 47 size_t lineno; 48 regex_t *re; 49 } u; 50 enum { 51 IGNORE, /* empty address, ignore */ 52 EVERY , /* every line */ 53 LINE , /* ilne number */ 54 LAST , /* last line ($) */ 55 REGEX , /* use included regex */ 56 LASTRE, /* use most recently used regex */ 57 } type; 58 } Addr; 59 60 /* DISCUSS: naddr is not strictly necessary, but very helpful 61 * naddr == 0 iff beg.type == EVERY && end.type == IGNORE 62 * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE 63 * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE 64 */ 65 typedef struct { 66 Addr beg; 67 Addr end; 68 unsigned char naddr; 69 } Range; 70 71 typedef struct { 72 regex_t *re; /* if NULL use last regex */ 73 String repl; 74 FILE *file; 75 size_t occurrence; /* 0 for all (g flag) */ 76 Rune delim; 77 unsigned int p:1; 78 } Sarg; 79 80 typedef struct { 81 Rune *set1; 82 Rune *set2; 83 } Yarg; 84 85 typedef struct { 86 String str; /* a,c,i text. r file path */ 87 void (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */ 88 } ACIRarg; 89 90 struct Cmd { 91 Range range; 92 Fninfo *fninfo; 93 union { 94 Cmd *jump; /* used for b,t when running */ 95 char *label; /* used for :,b,t when building */ 96 ptrdiff_t offset; /* used for { (pointers break during realloc) */ 97 FILE *file; /* used for w */ 98 99 /* FIXME: Should the following be in the union? or pointers and malloc? */ 100 Sarg s; 101 Yarg y; 102 ACIRarg acir; 103 } u; /* I find your lack of anonymous unions disturbing */ 104 unsigned int in_match:1; 105 unsigned int negate :1; 106 }; 107 108 /* Files for w command (and s' w flag) */ 109 typedef struct { 110 char *path; 111 FILE *file; 112 } Wfile; 113 114 /* 115 * Function Declarations 116 */ 117 118 /* Dynamically allocated arrays and strings */ 119 static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next); 120 static void *pop(Vec *v); 121 static void push(Vec *v, void *p); 122 static void stracat(String *dst, char *src); 123 static void strnacat(String *dst, char *src, size_t n); 124 static void stracpy(String *dst, char *src); 125 126 /* Cleanup and errors */ 127 static void usage(void); 128 129 /* Parsing functions and related utilities */ 130 static void compile(char *s, int isfile); 131 static int read_line(FILE *f, String *s); 132 static char *make_range(Range *range, char *s); 133 static char *make_addr(Addr *addr, char *s); 134 static char *find_delim(char *s, Rune delim, int do_brackets); 135 static char *chompr(char *s, Rune rune); 136 static char *chomp(char *s); 137 static Rune *strtorunes(char *s, size_t nrunes); 138 static long stol(char *s, char **endp); 139 static size_t escapes(char *beg, char *end, Rune delim, int n_newline); 140 static size_t echarntorune(Rune *r, char *s, size_t n); 141 static void insert_labels(void); 142 143 /* Get and Free arg and related utilities */ 144 static char *get_aci_arg(Cmd *c, char *s); 145 static void aci_append(Cmd *c, char *s); 146 static void free_acir_arg(Cmd *c); 147 static char *get_bt_arg(Cmd *c, char *s); 148 static char *get_r_arg(Cmd *c, char *s); 149 static char *get_s_arg(Cmd *c, char *s); 150 static void free_s_arg(Cmd *c); 151 static char *get_w_arg(Cmd *c, char *s); 152 static char *get_y_arg(Cmd *c, char *s); 153 static void free_y_arg(Cmd *c); 154 static char *get_colon_arg(Cmd *c, char *s); 155 static char *get_lbrace_arg(Cmd *c, char *s); 156 static char *get_rbrace_arg(Cmd *c, char *s); 157 static char *semicolon_arg(char *s); 158 159 /* Running */ 160 static void run(void); 161 static int in_range(Cmd *c); 162 static int match_addr(Addr *a); 163 static int next_file(void); 164 static int is_eof(FILE *f); 165 static void do_writes(void); 166 static void write_file(char *path, FILE *out); 167 static void check_puts(char *s, FILE *f); 168 static void update_ranges(Cmd *beg, Cmd *end); 169 170 /* Sed functions */ 171 static void cmd_y(Cmd *c); 172 static void cmd_x(Cmd *c); 173 static void cmd_w(Cmd *c); 174 static void cmd_t(Cmd *c); 175 static void cmd_s(Cmd *c); 176 static void cmd_r(Cmd *c); 177 static void cmd_q(Cmd *c); 178 static void cmd_P(Cmd *c); 179 static void cmd_p(Cmd *c); 180 static void cmd_N(Cmd *c); 181 static void cmd_n(Cmd *c); 182 static void cmd_l(Cmd *c); 183 static void cmd_i(Cmd *c); 184 static void cmd_H(Cmd *c); 185 static void cmd_h(Cmd *c); 186 static void cmd_G(Cmd *c); 187 static void cmd_g(Cmd *c); 188 static void cmd_D(Cmd *c); 189 static void cmd_d(Cmd *c); 190 static void cmd_c(Cmd *c); 191 static void cmd_b(Cmd *c); 192 static void cmd_a(Cmd *c); 193 static void cmd_colon(Cmd *c); 194 static void cmd_equal(Cmd *c); 195 static void cmd_lbrace(Cmd *c); 196 static void cmd_rbrace(Cmd *c); 197 static void cmd_last(Cmd *c); 198 199 /* Actions */ 200 static void new_line(void); 201 static void app_line(void); 202 static void new_next(void); 203 static void old_next(void); 204 205 /* 206 * Globals 207 */ 208 static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */ 209 static Vec writes; /* holds cmd*. writes scheduled by a and r commands */ 210 static Vec wfiles; /* holds Wfile*. files for w and s///w commands */ 211 212 static Cmd *prog, *pc; /* Program, program counter */ 213 static size_t pcap; 214 static size_t lineno; 215 216 static regex_t *lastre; /* last used regex for empty regex search */ 217 static char **files; /* list of file names from argv */ 218 static FILE *file; /* current file we are reading */ 219 220 static String patt, hold, genbuf; 221 222 static struct { 223 unsigned int n :1; /* -n (no print) */ 224 unsigned int E :1; /* -E (extended re) */ 225 unsigned int s :1; /* s/// replacement happened */ 226 unsigned int aci_cont:1; /* a,c,i text continuation */ 227 unsigned int s_cont :1; /* s/// replacement text continuation */ 228 unsigned int halt :1; /* halt execution */ 229 } gflags; 230 231 /* FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */ 232 static Fninfo fns[] = { 233 ['a'] = { cmd_a , get_aci_arg , free_acir_arg , 1 }, /* schedule write of text for later */ 234 ['b'] = { cmd_b , get_bt_arg , NULL , 2 }, /* branch to label char *label when building, Cmd *jump when running */ 235 ['c'] = { cmd_c , get_aci_arg , free_acir_arg , 2 }, /* delete pattern space. at 0 or 1 addr or end of 2 addr, write text */ 236 ['d'] = { cmd_d , NULL , NULL , 2 }, /* delete pattern space */ 237 ['D'] = { cmd_D , NULL , NULL , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d) */ 238 ['g'] = { cmd_g , NULL , NULL , 2 }, /* replace pattern space with hold space */ 239 ['G'] = { cmd_G , NULL , NULL , 2 }, /* append newline and hold space to pattern space */ 240 ['h'] = { cmd_h , NULL , NULL , 2 }, /* replace hold space with pattern space */ 241 ['H'] = { cmd_H , NULL , NULL , 2 }, /* append newline and pattern space to hold space */ 242 ['i'] = { cmd_i , get_aci_arg , free_acir_arg , 1 }, /* write text */ 243 ['l'] = { cmd_l , NULL , NULL , 2 }, /* write pattern space in 'visually unambiguous form' */ 244 ['n'] = { cmd_n , NULL , NULL , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit) */ 245 ['N'] = { cmd_N , NULL , NULL , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */ 246 ['p'] = { cmd_p , NULL , NULL , 2 }, /* write pattern space */ 247 ['P'] = { cmd_P , NULL , NULL , 2 }, /* write pattern space up to first newline */ 248 ['q'] = { cmd_q , NULL , NULL , 1 }, /* quit */ 249 ['r'] = { cmd_r , get_r_arg , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file) */ 250 ['s'] = { cmd_s , get_s_arg , free_s_arg , 2 }, /* find/replace/all that crazy s stuff */ 251 ['t'] = { cmd_t , get_bt_arg , NULL , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */ 252 ['w'] = { cmd_w , get_w_arg , NULL , 2 }, /* append pattern space to file */ 253 ['x'] = { cmd_x , NULL , NULL , 2 }, /* exchange pattern and hold spaces */ 254 ['y'] = { cmd_y , get_y_arg , free_y_arg , 2 }, /* replace runes in set1 with runes in set2 */ 255 [':'] = { cmd_colon , get_colon_arg , NULL , 0 }, /* defines label for later b and t commands */ 256 ['='] = { cmd_equal , NULL , NULL , 1 }, /* printf("%d\n", line_number); */ 257 ['{'] = { cmd_lbrace, get_lbrace_arg, NULL , 2 }, /* if we match, run commands, otherwise jump to close */ 258 ['}'] = { cmd_rbrace, get_rbrace_arg, NULL , 0 }, /* noop, hold onto open for ease of building scripts */ 259 260 [0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */ 261 }; 262 263 /* 264 * Function Definitions 265 */ 266 267 /* given memory pointed to by *ptr that currently holds *nmemb members of size 268 * size, realloc to hold new_nmemb members, return new_nmemb in *memb and one 269 * past old end in *next. if realloc fails...explode 270 */ 271 static void 272 resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next) 273 { 274 void *n, *tmp; 275 276 if (new_nmemb) { 277 tmp = ereallocarray(*ptr, new_nmemb, size); 278 } else { /* turns out realloc(*ptr, 0) != free(*ptr) */ 279 free(*ptr); 280 tmp = NULL; 281 } 282 n = (char *)tmp + *nmemb * size; 283 *nmemb = new_nmemb; 284 *ptr = tmp; 285 if (next) 286 *next = n; 287 } 288 289 static void * 290 pop(Vec *v) 291 { 292 if (!v->size) 293 return NULL; 294 return v->data[--v->size]; 295 } 296 297 static void 298 push(Vec *v, void *p) 299 { 300 if (v->size == v->cap) 301 resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL); 302 v->data[v->size++] = p; 303 } 304 305 static void 306 stracat(String *dst, char *src) 307 { 308 int new = !dst->cap; 309 size_t len; 310 311 len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1; 312 if (dst->cap < len) 313 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 314 if (new) 315 *dst->str = '\0'; 316 strcat(dst->str, src); 317 } 318 319 static void 320 strnacat(String *dst, char *src, size_t n) 321 { 322 int new = !dst->cap; 323 size_t len; 324 325 len = strlen(src); 326 len = (new ? 0 : strlen(dst->str)) + MIN(n, len) + 1; 327 if (dst->cap < len) 328 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 329 if (new) 330 *dst->str = '\0'; 331 strlcat(dst->str, src, len); 332 } 333 334 static void 335 stracpy(String *dst, char *src) 336 { 337 size_t len; 338 339 len = strlen(src) + 1; 340 if (dst->cap < len) 341 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 342 strcpy(dst->str, src); 343 } 344 345 static void 346 leprintf(char *s) 347 { 348 if (errno) 349 eprintf("%zu: %s: %s\n", lineno, s, strerror(errno)); 350 else 351 eprintf("%zu: %s\n", lineno, s); 352 } 353 354 /* FIXME: write usage message */ 355 static void 356 usage(void) 357 { 358 eprintf("usage: sed [-nrE] script [file ...]\n" 359 " sed [-nrE] -e script [-e script] ... [-f scriptfile] ... [file ...]\n" 360 " sed [-nrE] [-e script] ... -f scriptfile [-f scriptfile] ... [file ...]\n"); 361 } 362 363 /* Differences from POSIX 364 * we allows semicolons and trailing blanks inside {} 365 * we allow spaces after ! (and in between !s) 366 * we allow extended regular expressions (-E) 367 */ 368 static void 369 compile(char *s, int isfile) 370 { 371 FILE *f; 372 373 if (!isfile && !*s) /* empty string script */ 374 return; 375 376 f = isfile ? fopen(s, "r") : fmemopen(s, strlen(s), "r"); 377 if (!f) 378 eprintf("fopen/fmemopen:"); 379 380 /* NOTE: get arg functions can't use genbuf */ 381 while (read_line(f, &genbuf) != EOF) { 382 s = genbuf.str; 383 384 /* if the first two characters of the script are "#n" default output shall be suppressed */ 385 if (++lineno == 1 && *s == '#' && s[1] == 'n') { 386 gflags.n = 1; 387 continue; 388 } 389 390 if (gflags.aci_cont) { 391 aci_append(pc - 1, s); 392 continue; 393 } 394 if (gflags.s_cont) 395 s = (pc - 1)->fninfo->getarg(pc - 1, s); 396 397 while (*s) { 398 s = chompr(s, ';'); 399 if (!*s || *s == '#') 400 break; 401 402 if ((size_t)(pc - prog) == pcap) 403 resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc); 404 405 pc->range.beg.type = pc->range.end.type = IGNORE; 406 pc->fninfo = NULL; 407 pc->in_match = 0; 408 409 s = make_range(&pc->range, s); 410 s = chomp(s); 411 pc->negate = *s == '!'; 412 s = chompr(s, '!'); 413 414 if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn) 415 leprintf("bad sed function"); 416 if (pc->range.naddr > pc->fninfo->naddr) 417 leprintf("wrong number of addresses"); 418 s++; 419 420 if (pc->fninfo->getarg) 421 s = pc->fninfo->getarg(pc, s); 422 423 pc++; 424 } 425 } 426 427 fshut(f, s); 428 } 429 430 /* FIXME: if we decide to honor lack of trailing newline, set/clear a global 431 * flag when reading a line 432 */ 433 static int 434 read_line(FILE *f, String *s) 435 { 436 ssize_t len; 437 438 if (!f) 439 return EOF; 440 441 if ((len = getline(&s->str, &s->cap, f)) < 0) { 442 if (ferror(f)) 443 eprintf("getline:"); 444 return EOF; 445 } 446 if (s->str[--len] == '\n') 447 s->str[len] = '\0'; 448 return 0; 449 } 450 451 /* read first range from s, return pointer to one past end of range */ 452 static char * 453 make_range(Range *range, char *s) 454 { 455 s = make_addr(&range->beg, s); 456 457 if (*s == ',') 458 s = make_addr(&range->end, s + 1); 459 else 460 range->end.type = IGNORE; 461 462 if (range->beg.type == EVERY && range->end.type == IGNORE) range->naddr = 0; 463 else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1; 464 else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2; 465 else leprintf("this is impossible..."); 466 467 return s; 468 } 469 470 /* read first addr from s, return pointer to one past end of addr */ 471 static char * 472 make_addr(Addr *addr, char *s) 473 { 474 Rune r; 475 char *p = s + strlen(s); 476 size_t rlen = echarntorune(&r, s, p - s); 477 478 if (r == '$') { 479 addr->type = LAST; 480 s += rlen; 481 } else if (isdigitrune(r)) { 482 addr->type = LINE; 483 addr->u.lineno = stol(s, &s); 484 } else if (r == '/' || r == '\\') { 485 Rune delim; 486 if (r == '\\') { 487 s += rlen; 488 rlen = echarntorune(&r, s, p - s); 489 } 490 if (r == '\\') 491 leprintf("bad delimiter '\\'"); 492 delim = r; 493 s += rlen; 494 rlen = echarntorune(&r, s, p - s); 495 if (r == delim) { 496 addr->type = LASTRE; 497 s += rlen; 498 } else { 499 addr->type = REGEX; 500 p = find_delim(s, delim, 1); 501 if (!*p) 502 leprintf("unclosed regex"); 503 p -= escapes(s, p, delim, 0); 504 *p++ = '\0'; 505 addr->u.re = emalloc(sizeof(*addr->u.re)); 506 eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0); 507 s = p; 508 } 509 } else { 510 addr->type = EVERY; 511 } 512 513 return s; 514 } 515 516 /* return pointer to first delim in s that is not escaped 517 * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside []) 518 * return pointer to trailing nul byte if no delim found 519 * 520 * any escaped character that is not special is just itself (POSIX undefined) 521 * FIXME: pull out into some util thing, will be useful for ed as well 522 */ 523 static char * 524 find_delim(char *s, Rune delim, int do_brackets) 525 { 526 enum { 527 OUTSIDE , /* not in brackets */ 528 BRACKETS_OPENING, /* last char was first [ or last two were first [^ */ 529 BRACKETS_INSIDE , /* inside [] */ 530 INSIDE_OPENING , /* inside [] and last char was [ */ 531 CLASS_INSIDE , /* inside class [::], or colating element [..] or [==], inside [] */ 532 CLASS_CLOSING , /* inside class [::], or colating element [..] or [==], and last character was the respective : . or = */ 533 } state = OUTSIDE; 534 535 Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */ 536 size_t rlen; 537 int escape = 0; 538 char *end = s + strlen(s); 539 540 for (; *s; s += rlen) { 541 rlen = echarntorune(&r, s, end - s); 542 543 if (state == BRACKETS_OPENING && r == '^' ) { continue; } 544 else if (state == BRACKETS_OPENING && r == ']' ) { state = BRACKETS_INSIDE ; continue; } 545 else if (state == BRACKETS_OPENING ) { state = BRACKETS_INSIDE ; } 546 547 if (state == CLASS_CLOSING && r == ']' ) { state = BRACKETS_INSIDE ; } 548 else if (state == CLASS_CLOSING ) { state = CLASS_INSIDE ; } 549 else if (state == CLASS_INSIDE && r == c ) { state = CLASS_CLOSING ; } 550 else if (state == INSIDE_OPENING && (r == ':' || 551 r == '.' || 552 r == '=') ) { state = CLASS_INSIDE ; c = r; } 553 else if (state == INSIDE_OPENING && r == ']' ) { state = OUTSIDE ; } 554 else if (state == INSIDE_OPENING ) { state = BRACKETS_INSIDE ; } 555 else if (state == BRACKETS_INSIDE && r == '[' ) { state = INSIDE_OPENING ; } 556 else if (state == BRACKETS_INSIDE && r == ']' ) { state = OUTSIDE ; } 557 else if (state == OUTSIDE && escape ) { escape = 0 ; } 558 else if (state == OUTSIDE && r == '\\' ) { escape = 1 ; } 559 else if (state == OUTSIDE && r == delim) return s; 560 else if (state == OUTSIDE && do_brackets && r == '[' ) { state = BRACKETS_OPENING; } 561 } 562 return s; 563 } 564 565 static char * 566 chomp(char *s) 567 { 568 return chompr(s, 0); 569 } 570 571 /* eat all leading whitespace and occurrences of rune */ 572 static char * 573 chompr(char *s, Rune rune) 574 { 575 Rune r; 576 size_t rlen; 577 char *end = s + strlen(s); 578 579 while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune)) 580 s += rlen; 581 return s; 582 } 583 584 /* convert first nrunes Runes from UTF-8 string s in allocated Rune* 585 * NOTE: sequence must be valid UTF-8, check first */ 586 static Rune * 587 strtorunes(char *s, size_t nrunes) 588 { 589 Rune *rs, *rp; 590 591 rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs)); 592 593 while (nrunes--) 594 s += chartorune(rp++, s); 595 596 *rp = '\0'; 597 return rs; 598 } 599 600 static long 601 stol(char *s, char **endp) 602 { 603 long n; 604 errno = 0; 605 n = strtol(s, endp, 10); 606 607 if (errno) 608 leprintf("strtol:"); 609 if (*endp == s) 610 leprintf("strtol: invalid number"); 611 612 return n; 613 } 614 615 /* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim) 616 * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal) 617 * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command) 618 * if delim is 0 all escaped characters represent themselves (aci text) 619 * memmove rest of string (beyond end) into place 620 * return the number of converted escapes (backslashes removed) 621 * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better 622 */ 623 static size_t 624 escapes(char *beg, char *end, Rune delim, int n_newline) 625 { 626 size_t num = 0; 627 char *src = beg, *dst = beg; 628 629 while (src < end) { 630 /* handle escaped backslash specially so we don't think the second 631 * backslash is escaping something */ 632 if (*src == '\\' && src[1] == '\\') { 633 *dst++ = *src++; 634 if (delim) 635 *dst++ = *src++; 636 else 637 src++; 638 } else if (*src == '\\' && !delim) { 639 src++; 640 } else if (*src == '\\' && src[1]) { 641 Rune r; 642 size_t rlen; 643 num++; 644 src++; 645 rlen = echarntorune(&r, src, end - src); 646 647 if (r == 'n' && delim == 'n') { 648 *src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */ 649 } else if (r == 'n') { 650 *src = '\n'; 651 } else if (r != delim) { 652 *dst++ = '\\'; 653 num--; 654 } 655 656 memmove(dst, src, rlen); 657 dst += rlen; 658 src += rlen; 659 } else { 660 *dst++ = *src++; 661 } 662 } 663 memmove(dst, src, strlen(src) + 1); 664 return num; 665 } 666 667 static size_t 668 echarntorune(Rune *r, char *s, size_t n) 669 { 670 size_t rlen = charntorune(r, s, n); 671 if (!rlen || *r == Runeerror) 672 leprintf("invalid UTF-8"); 673 return rlen; 674 } 675 676 static void 677 insert_labels(void) 678 { 679 size_t i; 680 Cmd *from, *to; 681 682 while (branches.size) { 683 from = prog + (ptrdiff_t)pop(&branches); 684 685 if (!from->u.label) {/* no label branch to end of script */ 686 from->u.jump = pc - 1; 687 } else { 688 for (i = 0; i < labels.size; i++) { 689 to = prog + (ptrdiff_t)labels.data[i]; 690 if (!strcmp(from->u.label, to->u.label)) { 691 from->u.jump = to; 692 break; 693 } 694 } 695 if (i == labels.size) 696 leprintf("bad label"); 697 } 698 } 699 } 700 701 /* 702 * Getargs / Freeargs 703 * Read argument from s, return pointer to one past last character of argument 704 */ 705 706 /* POSIX compliant 707 * i\ 708 * foobar 709 * 710 * also allow the following non POSIX compliant 711 * i # empty line 712 * ifoobar 713 * ifoobar\ 714 * baz 715 * 716 * FIXME: GNU and busybox discard leading spaces 717 * i foobar 718 * i foobar 719 * ifoobar 720 * are equivalent in GNU and busybox. We don't. Should we? 721 */ 722 static char * 723 get_aci_arg(Cmd *c, char *s) 724 { 725 c->u.acir.print = check_puts; 726 c->u.acir.str = (String){ NULL, 0 }; 727 728 gflags.aci_cont = !!*s; /* no continue flag if empty string */ 729 730 /* neither empty string nor POSIX compliant */ 731 if (*s && !(*s == '\\' && !s[1])) 732 aci_append(c, s); 733 734 return s + strlen(s); 735 } 736 737 static void 738 aci_append(Cmd *c, char *s) 739 { 740 char *end = s + strlen(s), *p = end; 741 742 gflags.aci_cont = 0; 743 while (--p >= s && *p == '\\') 744 gflags.aci_cont = !gflags.aci_cont; 745 746 if (gflags.aci_cont) 747 *--end = '\n'; 748 749 escapes(s, end, 0, 0); 750 stracat(&c->u.acir.str, s); 751 } 752 753 static void 754 free_acir_arg(Cmd *c) 755 { 756 free(c->u.acir.str.str); 757 } 758 759 /* POSIX dictates that label is rest of line, including semicolons, trailing 760 * whitespace, closing braces, etc. and can be limited to 8 bytes 761 * 762 * I allow a semicolon or closing brace to terminate a label name, it's not 763 * POSIX compliant, but it's useful and every sed version I've tried to date 764 * does the same. 765 * 766 * FIXME: POSIX dictates that leading whitespace is ignored but trailing 767 * whitespace is not. This is annoying and we should probably get rid of it. 768 */ 769 static char * 770 get_bt_arg(Cmd *c, char *s) 771 { 772 char *p = semicolon_arg(s = chomp(s)); 773 774 if (p != s) { 775 c->u.label = estrndup(s, p - s); 776 } else { 777 c->u.label = NULL; 778 } 779 780 push(&branches, (void *)(c - prog)); 781 782 return p; 783 } 784 785 /* POSIX dictates file name is rest of line including semicolons, trailing 786 * whitespace, closing braces, etc. and file name must be preceded by a space 787 * 788 * I allow a semicolon or closing brace to terminate a file name and don't 789 * enforce leading space. 790 * 791 * FIXME: decide whether trailing whitespace should be included and fix 792 * accordingly 793 */ 794 static char * 795 get_r_arg(Cmd *c, char *s) 796 { 797 char *p = semicolon_arg(s = chomp(s)); 798 799 if (p == s) 800 leprintf("no file name"); 801 802 c->u.acir.str.str = estrndup(s, p - s); 803 c->u.acir.print = write_file; 804 805 return p; 806 } 807 808 /* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX) 809 * 810 * FIXME: allow other escapes in regex and replacement? if so change escapes() 811 */ 812 static char * 813 get_s_arg(Cmd *c, char *s) 814 { 815 Rune delim, r; 816 Cmd buf; 817 char *p; 818 int esc, lastre; 819 820 /* s/Find/Replace/Flags */ 821 822 /* Find */ 823 if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */ 824 lastre = 0; 825 c->u.s.repl = (String){ NULL, 0 }; 826 c->u.s.occurrence = 1; 827 c->u.s.file = NULL; 828 c->u.s.p = 0; 829 830 if (!*s || *s == '\\') 831 leprintf("bad delimiter"); 832 833 p = s + strlen(s); 834 s += echarntorune(&delim, s, p - s); 835 c->u.s.delim = delim; 836 837 echarntorune(&r, s, p - s); 838 if (r == delim) /* empty regex */ 839 lastre = 1; 840 841 p = find_delim(s, delim, 1); 842 if (!*p) 843 leprintf("missing second delimiter"); 844 p -= escapes(s, p, delim, 0); 845 *p = '\0'; 846 847 if (lastre) { 848 c->u.s.re = NULL; 849 } else { 850 c->u.s.re = emalloc(sizeof(*c->u.s.re)); 851 /* FIXME: different eregcomp that calls fatal */ 852 eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0); 853 } 854 s = p + runelen(delim); 855 } 856 857 /* Replace */ 858 delim = c->u.s.delim; 859 860 p = find_delim(s, delim, 0); 861 p -= escapes(s, p, delim, 0); 862 if (!*p) { /* no third delimiter */ 863 /* FIXME: same backslash counting as aci_append() */ 864 if (p[-1] != '\\') 865 leprintf("missing third delimiter or <backslash><newline>"); 866 p[-1] = '\n'; 867 gflags.s_cont = 1; 868 } else { 869 gflags.s_cont = 0; 870 } 871 872 /* check for bad references in replacement text */ 873 *p = '\0'; 874 for (esc = 0, p = s; *p; p++) { 875 if (esc) { 876 esc = 0; 877 if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub) 878 leprintf("back reference number greater than number of groups"); 879 } else if (*p == '\\') { 880 esc = 1; 881 } 882 } 883 stracat(&c->u.s.repl, s); 884 885 if (gflags.s_cont) 886 return p; 887 888 s = p + runelen(delim); 889 890 /* Flags */ 891 p = semicolon_arg(s = chomp(s)); 892 893 /* FIXME: currently for simplicity take last of g or occurrence flags and 894 * ignore multiple p flags. need to fix that */ 895 for (; s < p; s++) { 896 if (isdigit(*s)) { 897 c->u.s.occurrence = stol(s, &s); 898 s--; /* for loop will advance pointer */ 899 } else { 900 switch (*s) { 901 case 'g': c->u.s.occurrence = 0; break; 902 case 'p': c->u.s.p = 1; break; 903 case 'w': 904 /* must be last flag, take everything up to newline/semicolon 905 * s == p after this */ 906 s = get_w_arg(&buf, chomp(s+1)); 907 c->u.s.file = buf.u.file; 908 break; 909 } 910 } 911 } 912 return p; 913 } 914 915 static void 916 free_s_arg(Cmd *c) 917 { 918 if (c->u.s.re) 919 regfree(c->u.s.re); 920 free(c->u.s.re); 921 free(c->u.s.repl.str); 922 } 923 924 /* see get_r_arg notes */ 925 static char * 926 get_w_arg(Cmd *c, char *s) 927 { 928 char *p = semicolon_arg(s = chomp(s)); 929 Wfile *w, **wp; 930 931 if (p == s) 932 leprintf("no file name"); 933 934 for (wp = (Wfile **)wfiles.data; (size_t)(wp - (Wfile **)wfiles.data) < wfiles.size; wp++) { 935 if (strlen((*wp)->path) == (size_t)(p - s) && !strncmp(s, (*wp)->path, p - s)) { 936 c->u.file = (*wp)->file; 937 return p; 938 } 939 } 940 941 w = emalloc(sizeof(*w)); 942 w->path = estrndup(s, p - s); 943 944 if (!(w->file = fopen(w->path, "w"))) 945 leprintf("fopen failed"); 946 947 c->u.file = w->file; 948 949 push(&wfiles, w); 950 return p; 951 } 952 953 static char * 954 get_y_arg(Cmd *c, char *s) 955 { 956 Rune delim; 957 char *p = s + strlen(s); 958 size_t rlen = echarntorune(&delim, s, p - s); 959 size_t nrunes1, nrunes2; 960 961 c->u.y.set1 = c->u.y.set2 = NULL; 962 963 s += rlen; 964 p = find_delim(s, delim, 0); 965 p -= escapes(s, p, delim, 1); 966 nrunes1 = utfnlen(s, p - s); 967 c->u.y.set1 = strtorunes(s, nrunes1); 968 969 s = p + rlen; 970 p = find_delim(s, delim, 0); 971 p -= escapes(s, p, delim, 1); 972 nrunes2 = utfnlen(s, p - s); 973 974 if (nrunes1 != nrunes2) 975 leprintf("different set lengths"); 976 977 c->u.y.set2 = strtorunes(s, utfnlen(s, p - s)); 978 979 return p + rlen; 980 } 981 982 static void 983 free_y_arg(Cmd *c) 984 { 985 free(c->u.y.set1); 986 free(c->u.y.set2); 987 } 988 989 /* see get_bt_arg notes */ 990 static char * 991 get_colon_arg(Cmd *c, char *s) 992 { 993 char *p = semicolon_arg(s = chomp(s)); 994 995 if (p == s) 996 leprintf("no label name"); 997 998 c->u.label = estrndup(s, p - s); 999 push(&labels, (void *)(c - prog)); 1000 return p; 1001 } 1002 1003 static char * 1004 get_lbrace_arg(Cmd *c, char *s) 1005 { 1006 push(&braces, (void *)(c - prog)); 1007 return s; 1008 } 1009 1010 static char * 1011 get_rbrace_arg(Cmd *c, char *s) 1012 { 1013 Cmd *lbrace; 1014 1015 if (!braces.size) 1016 leprintf("extra }"); 1017 1018 lbrace = prog + (ptrdiff_t)pop(&braces); 1019 lbrace->u.offset = c - prog; 1020 return s; 1021 } 1022 1023 /* s points to beginning of an argument that may be semicolon terminated 1024 * return pointer to semicolon or nul byte after string 1025 * or closing brace as to not force ; before } 1026 * FIXME: decide whether or not to eat trailing whitespace for arguments that 1027 * we allow semicolon/brace termination that POSIX doesn't 1028 * b, r, t, w, : 1029 * POSIX says trailing whitespace is part of label name, file name, etc. 1030 * we should probably eat it 1031 */ 1032 static char * 1033 semicolon_arg(char *s) 1034 { 1035 char *p = strpbrk(s, ";}"); 1036 if (!p) 1037 p = s + strlen(s); 1038 return p; 1039 } 1040 1041 static void 1042 run(void) 1043 { 1044 lineno = 0; 1045 if (braces.size) 1046 leprintf("extra {"); 1047 1048 /* genbuf has already been initialized, patt will be in new_line 1049 * (or we'll halt) */ 1050 stracpy(&hold, ""); 1051 1052 insert_labels(); 1053 next_file(); 1054 new_line(); 1055 1056 for (pc = prog; !gflags.halt; pc++) 1057 pc->fninfo->fn(pc); 1058 } 1059 1060 /* return true if we are in range for c, set c->in_match appropriately */ 1061 static int 1062 in_range(Cmd *c) 1063 { 1064 if (match_addr(&c->range.beg)) { 1065 if (c->range.naddr == 2) { 1066 if (c->range.end.type == LINE && c->range.end.u.lineno <= lineno) 1067 c->in_match = 0; 1068 else 1069 c->in_match = 1; 1070 } 1071 return !c->negate; 1072 } 1073 if (c->in_match && match_addr(&c->range.end)) { 1074 c->in_match = 0; 1075 return !c->negate; 1076 } 1077 return c->in_match ^ c->negate; 1078 } 1079 1080 /* return true if addr matches current line */ 1081 static int 1082 match_addr(Addr *a) 1083 { 1084 switch (a->type) { 1085 default: 1086 case IGNORE: return 0; 1087 case EVERY: return 1; 1088 case LINE: return lineno == a->u.lineno; 1089 case LAST: 1090 while (is_eof(file) && !next_file()) 1091 ; 1092 return !file; 1093 case REGEX: 1094 lastre = a->u.re; 1095 return !regexec(a->u.re, patt.str, 0, NULL, 0); 1096 case LASTRE: 1097 if (!lastre) 1098 leprintf("no previous regex"); 1099 return !regexec(lastre, patt.str, 0, NULL, 0); 1100 } 1101 } 1102 1103 /* move to next input file 1104 * stdin if first call and no files 1105 * return 0 for success and 1 for no more files 1106 */ 1107 static int 1108 next_file(void) 1109 { 1110 static unsigned char first = 1; 1111 1112 if (file == stdin) 1113 clearerr(file); 1114 else if (file) 1115 fshut(file, "<file>"); 1116 file = NULL; 1117 1118 do { 1119 if (!*files) { 1120 if (first) /* given no files, default to stdin */ 1121 file = stdin; 1122 /* else we've used all our files, leave file = NULL */ 1123 } else if (!strcmp(*files, "-")) { 1124 file = stdin; 1125 files++; 1126 } else if (!(file = fopen(*files++, "r"))) { 1127 /* warn this file didn't open, but move on to next */ 1128 weprintf("fopen:"); 1129 } 1130 } while (!file && *files); 1131 first = 0; 1132 1133 return !file; 1134 } 1135 1136 /* test if stream is at EOF */ 1137 static int 1138 is_eof(FILE *f) 1139 { 1140 int c; 1141 1142 if (!f || feof(f)) 1143 return 1; 1144 1145 c = fgetc(f); 1146 if (c == EOF && ferror(f)) 1147 eprintf("fgetc:"); 1148 if (c != EOF && ungetc(c, f) == EOF) 1149 eprintf("ungetc EOF\n"); 1150 1151 return c == EOF; 1152 } 1153 1154 /* perform writes that were scheduled 1155 * for aci this is check_puts(string, stdout) 1156 * for r this is write_file(path, stdout) 1157 */ 1158 static void 1159 do_writes(void) 1160 { 1161 Cmd *c; 1162 size_t i; 1163 1164 for (i = 0; i < writes.size; i++) { 1165 c = writes.data[i]; 1166 c->u.acir.print(c->u.acir.str.str, stdout); 1167 } 1168 writes.size = 0; 1169 } 1170 1171 /* used for r's u.acir.print() 1172 * FIXME: something like util's concat() would be better 1173 */ 1174 static void 1175 write_file(char *path, FILE *out) 1176 { 1177 FILE *in = fopen(path, "r"); 1178 if (!in) /* no file is treated as empty file */ 1179 return; 1180 1181 while (read_line(in, &genbuf) != EOF) 1182 check_puts(genbuf.str, out); 1183 1184 fshut(in, path); 1185 } 1186 1187 static void 1188 check_puts(char *s, FILE *f) 1189 { 1190 if (s && fputs(s, f) == EOF) 1191 eprintf("fputs:"); 1192 if (fputs("\n", f) == EOF) 1193 eprintf("fputs:"); 1194 } 1195 1196 /* iterate from beg to end updating ranges so we don't miss any commands 1197 * e.g. sed -n '1d;1,3p' should still print lines 2 and 3 1198 */ 1199 static void 1200 update_ranges(Cmd *beg, Cmd *end) 1201 { 1202 while (beg < end) 1203 in_range(beg++); 1204 } 1205 1206 /* 1207 * Sed functions 1208 */ 1209 static void 1210 cmd_a(Cmd *c) 1211 { 1212 if (in_range(c)) 1213 push(&writes, c); 1214 } 1215 1216 static void 1217 cmd_b(Cmd *c) 1218 { 1219 if (!in_range(c)) 1220 return; 1221 1222 /* if we jump backwards update to end, otherwise update to destination */ 1223 update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap); 1224 pc = c->u.jump; 1225 } 1226 1227 static void 1228 cmd_c(Cmd *c) 1229 { 1230 if (!in_range(c)) 1231 return; 1232 1233 /* write the text on the last line of the match */ 1234 if (!c->in_match) 1235 check_puts(c->u.acir.str.str, stdout); 1236 /* otherwise start the next cycle without printing pattern space 1237 * effectively deleting the text */ 1238 new_next(); 1239 } 1240 1241 static void 1242 cmd_d(Cmd *c) 1243 { 1244 if (!in_range(c)) 1245 return; 1246 1247 new_next(); 1248 } 1249 1250 static void 1251 cmd_D(Cmd *c) 1252 { 1253 char *p; 1254 1255 if (!in_range(c)) 1256 return; 1257 1258 if ((p = strchr(patt.str, '\n'))) { 1259 p++; 1260 memmove(patt.str, p, strlen(p) + 1); 1261 old_next(); 1262 } else { 1263 new_next(); 1264 } 1265 } 1266 1267 static void 1268 cmd_g(Cmd *c) 1269 { 1270 if (in_range(c)) 1271 stracpy(&patt, hold.str); 1272 } 1273 1274 static void 1275 cmd_G(Cmd *c) 1276 { 1277 if (!in_range(c)) 1278 return; 1279 1280 stracat(&patt, "\n"); 1281 stracat(&patt, hold.str); 1282 } 1283 1284 static void 1285 cmd_h(Cmd *c) 1286 { 1287 if (in_range(c)) 1288 stracpy(&hold, patt.str); 1289 } 1290 1291 static void 1292 cmd_H(Cmd *c) 1293 { 1294 if (!in_range(c)) 1295 return; 1296 1297 stracat(&hold, "\n"); 1298 stracat(&hold, patt.str); 1299 } 1300 1301 static void 1302 cmd_i(Cmd *c) 1303 { 1304 if (in_range(c)) 1305 check_puts(c->u.acir.str.str, stdout); 1306 } 1307 1308 /* I think it makes sense to print invalid UTF-8 sequences in octal to satisfy 1309 * the "visually unambiguous form" sed(1p) 1310 */ 1311 static void 1312 cmd_l(Cmd *c) 1313 { 1314 Rune r; 1315 char *p, *end; 1316 size_t rlen; 1317 1318 char *escapes[] = { /* FIXME: 7 entries and search instead of 127 */ 1319 ['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b", 1320 ['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t", 1321 ['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */ 1322 }; 1323 1324 if (!in_range(c)) 1325 return; 1326 1327 /* FIXME: line wrapping. sed(1p) says "length at which folding occurs is 1328 * unspecified, but should be appropraite for the output device" 1329 * just wrap at 80 Runes? 1330 */ 1331 for (p = patt.str, end = p + strlen(p); p < end; p += rlen) { 1332 if (isascii(*p) && escapes[(unsigned int)*p]) { 1333 fputs(escapes[(unsigned int)*p], stdout); 1334 rlen = 1; 1335 } else if (!(rlen = charntorune(&r, p, end - p))) { 1336 /* ran out of chars, print the bytes of the short sequence */ 1337 for (; p < end; p++) 1338 printf("\\%03hho", (unsigned char)*p); 1339 break; 1340 } else if (r == Runeerror) { 1341 for (; rlen; rlen--, p++) 1342 printf("\\%03hho", (unsigned char)*p); 1343 } else { 1344 while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR) 1345 ; 1346 if (ferror(stdout)) 1347 eprintf("fwrite:"); 1348 } 1349 } 1350 check_puts("$", stdout); 1351 } 1352 1353 static void 1354 cmd_n(Cmd *c) 1355 { 1356 if (!in_range(c)) 1357 return; 1358 1359 if (!gflags.n) 1360 check_puts(patt.str, stdout); 1361 do_writes(); 1362 new_line(); 1363 } 1364 1365 static void 1366 cmd_N(Cmd *c) 1367 { 1368 if (!in_range(c)) 1369 return; 1370 do_writes(); 1371 app_line(); 1372 } 1373 1374 static void 1375 cmd_p(Cmd *c) 1376 { 1377 if (in_range(c)) 1378 check_puts(patt.str, stdout); 1379 } 1380 1381 static void 1382 cmd_P(Cmd *c) 1383 { 1384 char *p; 1385 1386 if (!in_range(c)) 1387 return; 1388 1389 if ((p = strchr(patt.str, '\n'))) 1390 *p = '\0'; 1391 1392 check_puts(patt.str, stdout); 1393 1394 if (p) 1395 *p = '\n'; 1396 } 1397 1398 static void 1399 cmd_q(Cmd *c) 1400 { 1401 if (!in_range(c)) 1402 return; 1403 1404 if (!gflags.n) 1405 check_puts(patt.str, stdout); 1406 do_writes(); 1407 gflags.halt = 1; 1408 } 1409 1410 static void 1411 cmd_r(Cmd *c) 1412 { 1413 if (in_range(c)) 1414 push(&writes, c); 1415 } 1416 1417 static void 1418 cmd_s(Cmd *c) 1419 { 1420 String tmp; 1421 Rune r; 1422 size_t plen, rlen, len; 1423 char *p, *s, *end; 1424 unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0; 1425 regex_t *re; 1426 regmatch_t *rm, *pmatch = NULL; 1427 1428 if (!in_range(c)) 1429 return; 1430 1431 if (!c->u.s.re && !lastre) 1432 leprintf("no previous regex"); 1433 1434 re = c->u.s.re ? c->u.s.re : lastre; 1435 lastre = re; 1436 1437 plen = re->re_nsub + 1; 1438 pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t)); 1439 1440 *genbuf.str = '\0'; 1441 s = patt.str; 1442 1443 while (!qflag && !regexec(re, s, plen, pmatch, cflags)) { 1444 cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */ 1445 if (!*s) /* match against empty string first time, but not again */ 1446 qflag = 1; 1447 1448 /* don't substitute if last match was not empty but this one is. 1449 * s_a*_._g 1450 * foobar -> .f.o.o.b.r. 1451 */ 1452 if ((last_empty || pmatch[0].rm_eo) && 1453 (++matches == c->u.s.occurrence || !c->u.s.occurrence)) { 1454 /* copy over everything before the match */ 1455 strnacat(&genbuf, s, pmatch[0].rm_so); 1456 1457 /* copy over replacement text, taking into account &, backreferences, and \ escapes */ 1458 for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) { 1459 strnacat(&genbuf, p, len); 1460 p += len; 1461 switch (*p) { 1462 default: leprintf("this shouldn't be possible"); 1463 case '\0': 1464 /* we're at the end, back up one so the ++p will put us on 1465 * the null byte to break out of the loop */ 1466 --p; 1467 break; 1468 case '&': 1469 strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so); 1470 break; 1471 case '\\': 1472 if (isdigit(*++p)) { /* backreference */ 1473 /* only need to check here if using lastre, otherwise we checked when building */ 1474 if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub) 1475 leprintf("back reference number greater than number of groups"); 1476 rm = &pmatch[*p - '0']; 1477 strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so); 1478 } else { /* character after backslash taken literally (well one byte, but it works) */ 1479 strnacat(&genbuf, p, 1); 1480 } 1481 break; 1482 } 1483 } 1484 } else { 1485 /* not replacing, copy over everything up to and including the match */ 1486 strnacat(&genbuf, s, pmatch[0].rm_eo); 1487 } 1488 1489 if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */ 1490 end = s + strlen(s); 1491 rlen = charntorune(&r, s, end - s); 1492 1493 if (!rlen) { /* ran out of bytes, copy short sequence */ 1494 stracat(&genbuf, s); 1495 s = end; 1496 } else { /* copy whether or not it's a good rune */ 1497 strnacat(&genbuf, s, rlen); 1498 s += rlen; 1499 } 1500 } 1501 last_empty = !pmatch[0].rm_eo; 1502 s += pmatch[0].rm_eo; 1503 } 1504 free(pmatch); 1505 1506 if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */ 1507 return; 1508 1509 gflags.s = 1; 1510 1511 stracat(&genbuf, s); 1512 1513 tmp = patt; 1514 patt = genbuf; 1515 genbuf = tmp; 1516 1517 if (c->u.s.p) 1518 check_puts(patt.str, stdout); 1519 if (c->u.s.file) 1520 check_puts(patt.str, c->u.s.file); 1521 } 1522 1523 static void 1524 cmd_t(Cmd *c) 1525 { 1526 if (!in_range(c) || !gflags.s) 1527 return; 1528 1529 /* if we jump backwards update to end, otherwise update to destination */ 1530 update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap); 1531 pc = c->u.jump; 1532 gflags.s = 0; 1533 } 1534 1535 static void 1536 cmd_w(Cmd *c) 1537 { 1538 if (in_range(c)) 1539 check_puts(patt.str, c->u.file); 1540 } 1541 1542 static void 1543 cmd_x(Cmd *c) 1544 { 1545 String tmp; 1546 1547 if (!in_range(c)) 1548 return; 1549 1550 tmp = patt; 1551 patt = hold; 1552 hold = tmp; 1553 } 1554 1555 static void 1556 cmd_y(Cmd *c) 1557 { 1558 String tmp; 1559 Rune r, *rp; 1560 size_t n, rlen; 1561 char *s, *end, buf[UTFmax]; 1562 1563 if (!in_range(c)) 1564 return; 1565 1566 *genbuf.str = '\0'; 1567 for (s = patt.str, end = s + strlen(s); *s; s += rlen) { 1568 if (!(rlen = charntorune(&r, s, end - s))) { /* ran out of chars, copy rest */ 1569 stracat(&genbuf, s); 1570 break; 1571 } else if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */ 1572 strnacat(&genbuf, s, rlen); 1573 } else { 1574 for (rp = c->u.y.set1; *rp; rp++) 1575 if (*rp == r) 1576 break; 1577 if (*rp) { /* found r in set1, replace with Rune from set2 */ 1578 n = runetochar(buf, c->u.y.set2 + (rp - c->u.y.set1)); 1579 strnacat(&genbuf, buf, n); 1580 } else { 1581 strnacat(&genbuf, s, rlen); 1582 } 1583 } 1584 } 1585 tmp = patt; 1586 patt = genbuf; 1587 genbuf = tmp; 1588 } 1589 1590 static void 1591 cmd_colon(Cmd *c) 1592 { 1593 } 1594 1595 static void 1596 cmd_equal(Cmd *c) 1597 { 1598 if (in_range(c)) 1599 printf("%zu\n", lineno); 1600 } 1601 1602 static void 1603 cmd_lbrace(Cmd *c) 1604 { 1605 Cmd *jump; 1606 1607 if (in_range(c)) 1608 return; 1609 1610 /* update ranges on all commands we skip */ 1611 jump = prog + c->u.offset; 1612 update_ranges(c + 1, jump); 1613 pc = jump; 1614 } 1615 1616 static void 1617 cmd_rbrace(Cmd *c) 1618 { 1619 } 1620 1621 /* not actually a sed function, but acts like one, put in last spot of script */ 1622 static void 1623 cmd_last(Cmd *c) 1624 { 1625 if (!gflags.n) 1626 check_puts(patt.str, stdout); 1627 do_writes(); 1628 new_next(); 1629 } 1630 1631 /* 1632 * Actions 1633 */ 1634 1635 /* read new line, continue current cycle */ 1636 static void 1637 new_line(void) 1638 { 1639 while (read_line(file, &patt) == EOF) { 1640 if (next_file()) { 1641 gflags.halt = 1; 1642 return; 1643 } 1644 } 1645 gflags.s = 0; 1646 lineno++; 1647 } 1648 1649 /* append new line, continue current cycle 1650 * FIXME: used for N, POSIX specifies do not print pattern space when out of 1651 * input, but GNU does so busybox does as well. Currently we don't. 1652 * Should we? 1653 */ 1654 static void 1655 app_line(void) 1656 { 1657 while (read_line(file, &genbuf) == EOF) { 1658 if (next_file()) { 1659 gflags.halt = 1; 1660 return; 1661 } 1662 } 1663 1664 stracat(&patt, "\n"); 1665 stracat(&patt, genbuf.str); 1666 gflags.s = 0; 1667 lineno++; 1668 } 1669 1670 /* read new line, start new cycle */ 1671 static void 1672 new_next(void) 1673 { 1674 *patt.str = '\0'; 1675 update_ranges(pc + 1, prog + pcap); 1676 new_line(); 1677 pc = prog - 1; 1678 } 1679 1680 /* keep old pattern space, start new cycle */ 1681 static void 1682 old_next(void) 1683 { 1684 update_ranges(pc + 1, prog + pcap); 1685 pc = prog - 1; 1686 } 1687 1688 int 1689 main(int argc, char *argv[]) 1690 { 1691 char *arg; 1692 int ret = 0, script = 0; 1693 1694 ARGBEGIN { 1695 case 'n': 1696 gflags.n = 1; 1697 break; 1698 case 'r': 1699 case 'E': 1700 gflags.E = 1; 1701 break; 1702 case 'e': 1703 arg = EARGF(usage()); 1704 compile(arg, 0); 1705 script = 1; 1706 break; 1707 case 'f': 1708 arg = EARGF(usage()); 1709 compile(arg, 1); 1710 script = 1; 1711 break; 1712 default : usage(); 1713 } ARGEND 1714 1715 /* no script to run */ 1716 if (!script && !argc) 1717 usage(); 1718 1719 /* no script yet, next argument is script */ 1720 if (!script) 1721 compile(*argv++, 0); 1722 1723 /* shrink/grow memory to fit and add our last instruction */ 1724 resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL); 1725 pc = prog + pcap - 1; 1726 pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 }; 1727 1728 files = argv; 1729 run(); 1730 1731 ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>"); 1732 1733 return ret; 1734 }