Patch from Kent Robotti to bring fdisk in sync with v2.12 final.
[oweals/busybox.git] / editors / sed.c
1 /*
2  * sed.c - very minimalist version of sed
3  *
4  * Copyright (C) 1999,2000,2001 by Lineo, inc. and Mark Whitley
5  * Copyright (C) 1999,2000,2001 by Mark Whitley <markw@codepoet.org>
6  * Copyright (C) 2002  Matt Kraai
7  * Copyright (C) 2003 by Glenn McGrath <bug1@optushome.com.au>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  *
23  */
24
25 /*
26         Supported features and commands in this version of sed:
27
28          - comments ('#')
29          - address matching: num|/matchstr/[,num|/matchstr/|$]command
30          - commands: (p)rint, (d)elete, (s)ubstitute (with g & I flags)
31          - edit commands: (a)ppend, (i)nsert, (c)hange
32          - file commands: (r)ead
33          - backreferences in substitution expressions (\1, \2...\9)
34          - grouped commands: {cmd1;cmd2}
35          - transliteration (y/source-chars/dest-chars/)
36          - pattern space hold space storing / swapping (g, h, x)
37          - labels / branching (: label, b, t)
38
39          (Note: Specifying an address (range) to match is *optional*; commands
40          default to the whole pattern space if no specific address match was
41          requested.)
42
43         Unsupported features:
44
45          - GNU extensions
46          - and lots, lots more.
47
48         Bugs:
49         
50          - Can't substitute globally using ^ or $ in the regex, e.g. "aah" | sed 's/^a/b/g'
51
52         Reference http://www.opengroup.org/onlinepubs/007904975/utilities/sed.html
53 */
54
55 #include <stdio.h>
56 #include <unistd.h>             /* for getopt() */
57 #include <regex.h>
58 #include <string.h>             /* for strdup() */
59 #include <errno.h>
60 #include <ctype.h>              /* for isspace() */
61 #include <stdlib.h>
62 #include "busybox.h"
63
/*
 * One parsed sed command: its optional address (or address range), the
 * command letter and the per-command arguments collected by the parser.
 * Commands are chained through 'next' in the order they were added.
 */
typedef struct sed_cmd_s {
	/* Members are ordered by alignment requirements */

	/* --- address, regex form --- */
	regex_t *beg_match;	/* first address:  sed -e '/match/cmd' */
	regex_t *end_match;	/* second address: sed -e '/match/,/end_match/cmd' */

	/* --- address, line-number form --- */
	int beg_line;		/* 'sed 1p'   0 == no beginning line, apply to all lines; -1 == $ */
	int end_line;		/* 'sed 1,3p' 0 == no end line, use only beginning; -1 == $ */

	/* --- address inversion --- */
	int invert;		/* non-zero when a '!' followed the address */

	/* --- substitution (s) arguments: sed -e 's/sub_match/replace/' --- */
	regex_t *sub_match;	/* compiled search regex; NULL means "reuse the last regex" */
	char *replace;		/* replacement text, still containing \N and & markers */

	/* --- edit commands (a, i, c) --- */
	char *editline;		/* text to append / insert / change to */

	/* --- file command (r) --- */
	char *filename;		/* file whose contents are read */

	/* --- substitution (s) flags --- */
	unsigned int num_backrefs:4;	/* number of \( groups seen in the search regex;
					 * GNU/POSIX sed keeps at most nine, so 4 bits suffice */
	unsigned int sub_g:1;	/* sed -e 's/foo/bar/g' (replace every match) */
	unsigned int sub_p:1;	/* sed -e 's/foo/bar/p' (print if substituted) */

	/* --- transliteration (y) --- */
	char *translate;	/* pairs of bytes: source char, then its replacement */

	/* --- general --- */
	char cmd;		/* the command letter itself */

	/* --- labels and branching (:, b, t) --- */
	char *label;		/* label name that follows the command letter */

	/* --- list linkage --- */
	struct sed_cmd_s *next;	/* following command, or NULL at the end of the list */

} sed_cmd_t;
112
113
114 /* externs */
115 extern void xregcomp(regex_t * preg, const char *regex, int cflags);
116 extern int optind;              /* in unistd.h */
117 extern char *optarg;    /* ditto */
118
119 /* globals */
120 /* options */
121 static int be_quiet = 0;
122 static const char bad_format_in_subst[] =
123         "bad format in substitution expression";
124
125 /* linked list of sed commands */
126 static sed_cmd_t sed_cmd_head;
127 static sed_cmd_t *sed_cmd_tail = &sed_cmd_head;
128
129 const char *const semicolon_whitespace = "; \n\r\t\v\0";
130 static regex_t *previous_regex_ptr = NULL;
131
132
#ifdef CONFIG_FEATURE_CLEAN_UP
/*
 * destroy_cmd_strs - release every command linked after sed_cmd_head.
 *
 * For each node the compiled regexes are passed to regfree() and freed,
 * then every heap string the parser may have attached (replace, editline,
 * filename, translate, label) is freed, and finally the node itself.
 * Nodes are allocated zeroed, so members that were never set are NULL and
 * free() ignores them.
 */
static void destroy_cmd_strs(void)
{
	sed_cmd_t *sed_cmd = sed_cmd_head.next;

	while (sed_cmd) {
		/* fetch the successor before the node is freed */
		sed_cmd_t *sed_cmd_next = sed_cmd->next;

		if (sed_cmd->beg_match) {
			regfree(sed_cmd->beg_match);
			free(sed_cmd->beg_match);
		}
		if (sed_cmd->end_match) {
			regfree(sed_cmd->end_match);
			free(sed_cmd->end_match);
		}
		if (sed_cmd->sub_match) {
			regfree(sed_cmd->sub_match);
			free(sed_cmd->sub_match);
		}
		free(sed_cmd->replace);
		free(sed_cmd->editline);
		free(sed_cmd->filename);
		free(sed_cmd->translate);
		free(sed_cmd->label);
		free(sed_cmd);
		sed_cmd = sed_cmd_next;
	}
}
#endif
159
/*
 * index_of_next_unescaped_regexp_delim - scan 'str' from its start and
 * return the index of the first occurrence of 'delimiter' (typically a
 * forward slash '/') that is neither preceded by a backslash ('\') nor
 * located inside a bracket expression ("[...]").
 *
 * A ']' placed directly after the opening '[' or after "[^" is taken as a
 * member of the bracket expression rather than as its end.
 *
 * Returns -1 when the end of the string is reached first.
 */
static int index_of_next_unescaped_regexp_delim(const char delimiter,
	const char *str)
{
	int open_bracket = -1;		/* index of the '[' we are inside, else -1 */
	int after_backslash = 0;	/* previous character was an unescaped '\' */
	int pos;

	for (pos = 0; str[pos] != '\0'; pos++) {
		const char c = str[pos];

		if (open_bracket != -1) {
			/* inside [...]: only a non-leading ']' is significant */
			if (c == ']') {
				int leading = (open_bracket == pos - 1)
					|| (open_bracket == pos - 2 && str[pos - 1] == '^');

				if (!leading)
					open_bracket = -1;
			}
			continue;
		}

		if (after_backslash) {
			/* this character is escaped, whatever it is */
			after_backslash = 0;
			continue;
		}

		if (c == '\\') {
			after_backslash = 1;
		} else if (c == '[') {
			open_bracket = pos;
		} else if (c == delimiter) {
			return pos;
		}
	}

	/* ran off the end of the string without finding the delimiter */
	return -1;
}
192
193 static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
194 {
195         const char *cmdstr_ptr = cmdstr;
196         char delimiter;
197         int idx = 0;
198
199         /* verify that the 's' is followed by something.  That something
200          * (typically a 'slash') is now our regexp delimiter... */
201         if (*cmdstr == '\0')
202                 bb_error_msg_and_die(bad_format_in_subst);
203         else
204                 delimiter = *cmdstr_ptr;
205
206         cmdstr_ptr++;
207
208         /* save the match string */
209         idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
210         if (idx == -1) {
211                 bb_error_msg_and_die(bad_format_in_subst);
212         }
213         *match = bb_xstrndup(cmdstr_ptr, idx);
214
215         /* save the replacement string */
216         cmdstr_ptr += idx + 1;
217         idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
218         if (idx == -1) {
219                 bb_error_msg_and_die(bad_format_in_subst);
220         }
221         *replace = bb_xstrndup(cmdstr_ptr, idx);
222
223         return ((cmdstr_ptr - cmdstr) + idx);
224 }
225
/*
 * get_address - parse one address at the start of my_str.
 *
 * Recognised forms:
 *   digits   -> *linenum is set to the number
 *   $        -> *linenum is set to -1
 *   /re/     -> *regex receives a newly allocated, compiled regex
 *   \cREc    -> same, with 'c' as the delimiter
 *
 * my_str is modified: the closing delimiter of a regex address is
 * overwritten with '\0' so the pattern can be compiled in place.
 *
 * Returns the index in the string just past where the address ends, or 0
 * when my_str does not start with an address.  Dies on an unterminated
 * regex address.
 */
static int get_address(char *my_str, int *linenum, regex_t ** regex)
{
	int idx = 0;

	if (isdigit((unsigned char) my_str[idx])) {
		char *endstr;

		*linenum = strtol(my_str, &endstr, 10);
		/* strtol always stores a valid pointer in endstr */
		idx = endstr - my_str;
	} else if (my_str[idx] == '$') {
		*linenum = -1;
		idx++;
	} else if (my_str[idx] == '/' || my_str[idx] == '\\') {
		int idx_start = 1;	/* offset of the first pattern character */
		int delim_off;
		char delimiter = '/';

		if (my_str[idx] == '\\') {
			idx_start++;
			delimiter = my_str[++idx];
			/* a lone '\' at the end of the string has no delimiter;
			 * going on would read past the terminator */
			if (delimiter == '\0') {
				bb_error_msg_and_die("unterminated match expression");
			}
		}
		idx++;

		/* test the search result itself: once it has been added to idx
		 * a "not found" (-1) can no longer be recognised */
		delim_off = index_of_next_unescaped_regexp_delim(delimiter, my_str + idx);
		if (delim_off == -1) {
			bb_error_msg_and_die("unterminated match expression");
		}
		idx += delim_off;
		my_str[idx] = '\0';

		*regex = (regex_t *) xmalloc(sizeof(regex_t));
		xregcomp(*regex, my_str + idx_start, REG_NEWLINE);
		idx++;			/* so it points to the next character after the last delimiter */
	}
	return idx;
}
264
265 static int parse_subst_cmd(sed_cmd_t * const sed_cmd, const char *substr)
266 {
267         int cflags = 0;
268         char *match;
269         int idx = 0;
270         int j;
271
272         /*
273          * the string that gets passed to this function should look like this:
274          *    s/match/replace/gIp
275          *    ||     |        |||
276          *    mandatory       optional
277          *
278          *    (all three of the '/' slashes are mandatory)
279          */
280         idx = parse_regex_delim(substr, &match, &sed_cmd->replace);
281
282         /* determine the number of back references in the match string */
283         /* Note: we compute this here rather than in the do_subst_command()
284          * function to save processor time, at the expense of a little more memory
285          * (4 bits) per sed_cmd */
286
287         /* sed_cmd->num_backrefs = 0; *//* XXX: not needed? --apparently not */
288         for (j = 0; match[j]; j++) {
289                 /* GNU/POSIX sed does not save more than nine backrefs */
290                 if (match[j] == '\\' && match[j + 1] == '('
291                         && sed_cmd->num_backrefs <= 9)
292                         sed_cmd->num_backrefs++;
293         }
294
295         /* process the flags */
296         while (substr[++idx]) {
297                 switch (substr[idx]) {
298                 case 'g':
299                         sed_cmd->sub_g = 1;
300                         break;
301                         /* Hmm, i dont see the I option mentioned in the standard */
302                 case 'I':
303                         cflags |= REG_ICASE;
304                         break;
305                 case 'p':
306                         sed_cmd->sub_p = 1;
307                         break;
308                 default:
309                         /* any whitespace or semicolon trailing after a s/// is ok */
310                         if (strchr(semicolon_whitespace, substr[idx]))
311                                 goto out;
312                         /* else */
313                         bb_error_msg_and_die("bad option in substitution expression");
314                 }
315         }
316
317   out:
318         /* compile the match string into a regex */
319         if (*match != '\0') {
320                 /* If match is empty, we use last regex used at runtime */
321                 sed_cmd->sub_match = (regex_t *) xmalloc(sizeof(regex_t));
322                 xregcomp(sed_cmd->sub_match, match, cflags);
323         }
324         free(match);
325
326         return idx;
327 }
328
/*
 * replace_slash_n - convert every two-character sequence backslash + 'n'
 * in 'string' into a single newline character, in place.
 *
 * The string can only shrink, so the rewrite is done with a read cursor
 * and a write cursor over the same buffer.  All other characters,
 * including a backslash that is not followed by 'n', are kept unchanged.
 */
static void replace_slash_n(char *string)
{
	const char *src = string;
	char *dst = string;

	while (*src != '\0') {
		if (src[0] == '\\' && src[1] == 'n') {
			*dst++ = '\n';
			src += 2;	/* consume both the '\' and the 'n' */
		} else {
			*dst++ = *src++;
		}
	}
	*dst = '\0';
}
343
344 static int parse_translate_cmd(sed_cmd_t * const sed_cmd, const char *cmdstr)
345 {
346         char *match;
347         char *replace;
348         int idx;
349         int i;
350
351         idx = parse_regex_delim(cmdstr, &match, &replace);
352         replace_slash_n(match);
353         replace_slash_n(replace);
354         sed_cmd->translate = xcalloc(1, (strlen(match) + 1) * 2);
355         for (i = 0; (match[i] != 0) && (replace[i] != 0); i++) {
356                 sed_cmd->translate[i * 2] = match[i];
357                 sed_cmd->translate[(i * 2) + 1] = replace[i];
358         }
359         return (idx + 1);
360 }
361
362 static int parse_edit_cmd(sed_cmd_t * sed_cmd, const char *editstr)
363 {
364         int i, j;
365
366         /*
367          * the string that gets passed to this function should look like this:
368          *
369          *    need one of these 
370          *    |
371          *    |    this backslash (immediately following the edit command) is mandatory
372          *    |    |
373          *    [aic]\
374          *    TEXT1\
375          *    TEXT2\
376          *    TEXTN
377          *
378          * as soon as we hit a TEXT line that has no trailing '\', we're done.
379          * this means a command like:
380          *
381          * i\
382          * INSERTME
383          *
384          * is a-ok.
385          *
386          */
387         if ((*editstr != '\\') || ((editstr[1] != '\n') && (editstr[1] != '\r'))) {
388                 bb_error_msg_and_die("bad format in edit expression");
389         }
390
391         /* store the edit line text */
392         sed_cmd->editline = xmalloc(strlen(&editstr[2]) + 2);
393         for (i = 2, j = 0;
394                 editstr[i] != '\0' && strchr("\r\n", editstr[i]) == NULL; i++, j++) {
395                 if ((editstr[i] == '\\') && strchr("\n\r", editstr[i + 1]) != NULL) {
396                         sed_cmd->editline[j] = '\n';
397                         i++;
398                 } else
399                         sed_cmd->editline[j] = editstr[i];
400         }
401
402         /* figure out if we need to add a newline */
403         if (sed_cmd->editline[j - 1] != '\n')
404                 sed_cmd->editline[j++] = '\n';
405
406         /* terminate string */
407         sed_cmd->editline[j] = '\0';
408
409         return i;
410 }
411
412
413 static int parse_file_cmd(sed_cmd_t * sed_cmd, const char *filecmdstr)
414 {
415         int idx = 0;
416         int filenamelen = 0;
417
418         /*
419          * the string that gets passed to this function should look like this:
420          *    '[ ]filename'
421          *      |  |
422          *      |  a filename
423          *      |
424          *     optional whitespace
425
426          *   re: the file to be read, the GNU manual says the following: "Note that
427          *   if filename cannot be read, it is treated as if it were an empty file,
428          *   without any error indication." Thus, all of the following commands are
429          *   perfectly leagal:
430          *
431          *   sed -e '1r noexist'
432          *   sed -e '1r ;'
433          *   sed -e '1r'
434          */
435
436         /* the file command may be followed by whitespace; move past it. */
437         while (isspace(filecmdstr[++idx])) {;
438         }
439
440         /* the first non-whitespace we get is a filename. the filename ends when we
441          * hit a normal sed command terminator or end of string */
442         filenamelen = strcspn(&filecmdstr[idx], semicolon_whitespace);
443         sed_cmd->filename = xmalloc(filenamelen + 1);
444         safe_strncpy(sed_cmd->filename, &filecmdstr[idx], filenamelen + 1);
445         return idx + filenamelen;
446 }
447
448 /*
449  *  Process the commands arguments
450  */
451 static char *parse_cmd_str(sed_cmd_t * sed_cmd, char *cmdstr)
452 {
453         /* handle (s)ubstitution command */
454         if (sed_cmd->cmd == 's') {
455                 cmdstr += parse_subst_cmd(sed_cmd, cmdstr);
456         }
457         /* handle edit cmds: (a)ppend, (i)nsert, and (c)hange */
458         else if (strchr("aic", sed_cmd->cmd)) {
459                 if ((sed_cmd->end_line || sed_cmd->end_match) && sed_cmd->cmd != 'c')
460                         bb_error_msg_and_die
461                                 ("only a beginning address can be specified for edit commands");
462                 cmdstr += parse_edit_cmd(sed_cmd, cmdstr);
463         }
464         /* handle file cmds: (r)ead */
465         else if (sed_cmd->cmd == 'r') {
466                 if (sed_cmd->end_line || sed_cmd->end_match)
467                         bb_error_msg_and_die("Command only uses one address");
468                 cmdstr += parse_file_cmd(sed_cmd, cmdstr);
469         }
470         /* handle branch commands */
471         else if (strchr(":bt", sed_cmd->cmd)) {
472                 int length;
473
474                 cmdstr += strspn(cmdstr, " ");
475                 length = strcspn(cmdstr, "; \n");
476                 sed_cmd->label = strndup(cmdstr, length);
477                 cmdstr += length;
478         }
479         /* translation command */
480         else if (sed_cmd->cmd == 'y') {
481                 cmdstr += parse_translate_cmd(sed_cmd, cmdstr);
482         }
483         /* if it wasnt a single-letter command that takes no arguments
484          * then it must be an invalid command.
485          */
486         else if (strchr("dgGhHnNpPqx={}", sed_cmd->cmd) == 0) {
487                 bb_error_msg_and_die("Unsupported command %c", sed_cmd->cmd);
488         }
489
490         /* give back whatever's left over */
491         return (cmdstr);
492 }
493
494 static char *add_cmd(sed_cmd_t * sed_cmd, char *cmdstr)
495 {
496         /* Skip over leading whitespace and semicolons */
497         cmdstr += strspn(cmdstr, semicolon_whitespace);
498
499         /* if we ate the whole thing, that means there was just trailing
500          * whitespace or a final / no-op semicolon. either way, get out */
501         if (*cmdstr == '\0') {
502                 return (NULL);
503         }
504
505         /* if this is a comment, jump past it and keep going */
506         if (*cmdstr == '#') {
507                 /* "#n" is the same as using -n on the command line */
508                 if (cmdstr[1] == 'n') {
509                         be_quiet++;
510                 }
511                 return (strpbrk(cmdstr, "\n\r"));
512         }
513
514         /* parse the command
515          * format is: [addr][,addr]cmd
516          *            |----||-----||-|
517          *            part1 part2  part3
518          */
519
520         /* first part (if present) is an address: either a '$', a number or a /regex/ */
521         cmdstr += get_address(cmdstr, &sed_cmd->beg_line, &sed_cmd->beg_match);
522
523         /* second part (if present) will begin with a comma */
524         if (*cmdstr == ',') {
525                 int idx;
526
527                 cmdstr++;
528                 idx = get_address(cmdstr, &sed_cmd->end_line, &sed_cmd->end_match);
529                 if (idx == 0) {
530                         bb_error_msg_and_die("get_address: no address found in string\n"
531                                 "\t(you probably didn't check the string you passed me)");
532                 }
533                 cmdstr += idx;
534         }
535
536         /* skip whitespace before the command */
537         while (isspace(*cmdstr)) {
538                 cmdstr++;
539         }
540
541         /* there my be the inversion flag between part2 and part3 */
542         if (*cmdstr == '!') {
543                 sed_cmd->invert = 1;
544                 cmdstr++;
545
546 #ifdef SED_FEATURE_STRICT_CHECKING
547                 /* According to the spec
548                  * It is unspecified whether <blank>s can follow a '!' character,
549                  * and conforming applications shall not follow a '!' character
550                  * with <blank>s.
551                  */
552                 if (isblank(cmdstr[idx]) {
553                         bb_error_msg_and_die("blank follows '!'");}
554 #else
555                 /* skip whitespace before the command */
556                 while (isspace(*cmdstr)) {
557                         cmdstr++;
558                 }
559 #endif
560         }
561
562         /* last part (mandatory) will be a command */
563         if (*cmdstr == '\0')
564                 bb_error_msg_and_die("missing command");
565
566         sed_cmd->cmd = *cmdstr;
567         cmdstr++;
568
569         cmdstr = parse_cmd_str(sed_cmd, cmdstr);
570
571         /* Add the command to the command array */
572         sed_cmd_tail->next = sed_cmd;
573         sed_cmd_tail = sed_cmd_tail->next;
574
575         return (cmdstr);
576 }
577
578 static void add_cmd_str(char *cmdstr)
579 {
580 #ifdef CONFIG_FEATURE_SED_EMBEDED_NEWLINE
581         char *cmdstr_ptr = cmdstr;
582
583         /* HACK: convert "\n" to match tranlated '\n' string */
584         while ((cmdstr_ptr = strstr(cmdstr_ptr, "\\n")) != NULL) {
585                 cmdstr = xrealloc(cmdstr, strlen(cmdstr) + 2);
586                 cmdstr_ptr = strstr(cmdstr, "\\n");
587                 memmove(cmdstr_ptr + 1, cmdstr_ptr, strlen(cmdstr_ptr) + 1);
588                 cmdstr_ptr[0] = '\\';
589                 cmdstr_ptr += 3;
590         }
591 #endif
592         do {
593                 sed_cmd_t *sed_cmd;
594
595                 sed_cmd = xcalloc(1, sizeof(sed_cmd_t));
596                 cmdstr = add_cmd(sed_cmd, cmdstr);
597         } while (cmdstr && strlen(cmdstr));
598 }
599
600
/*
 * load_cmd_file - read a sed script from 'filename' and add its commands.
 *
 * The file is read line by line.  A line whose final newline is preceded
 * by a backslash is joined with the following line(s) before parsing, so
 * multi-line edit commands reach the parser as one string.  The file is
 * closed once every line has been processed.
 */
static void load_cmd_file(char *filename)
{
	FILE *cmdfile;
	char *line;
	char *nextline;
	char *e;

	cmdfile = bb_xfopen(filename, "r");

	while ((line = bb_get_line_from_file(cmdfile)) != NULL) {
		/* if a line ends with '\' it needs the next line appended to it */
		while (((e = last_char_is(line, '\n')) != NULL)
			&& (e > line) && (e[-1] == '\\')
			&& ((nextline = bb_get_line_from_file(cmdfile)) != NULL)) {
			/* (e - line) + 1 is the current length of line */
			line = xrealloc(line, (e - line) + 1 + strlen(nextline) + 1);
			strcat(line, nextline);
			free(nextline);
		}
		/* eat trailing newline (if any) --if I don't do this, edit commands
		 * (aic) will print an extra newline */
		chomp(line);
		add_cmd_str(line);
		free(line);
	}

	fclose(cmdfile);
}
626
/*
 * Growable output buffer used while building a substituted line.
 *
 * The last byte of the allocation always holds PIPE_MAGIC as an
 * end-of-buffer sentinel; every other byte not yet written is zero.
 */
struct pipeline {
	char *buf;	/* Space to hold string */
	int idx;	/* Space used */
	int len;	/* Space allocated */
};

#define PIPE_MAGIC 0x7f	/* sentinel stored in the last byte of buf */
#define PIPE_GROW 64	/* number of bytes added on each enlargement */

/*
 * pipe_putc - append one character to the pipeline buffer.
 *
 * When the write position has reached the sentinel byte the buffer is
 * enlarged by PIPE_GROW zeroed bytes and a new sentinel is placed at the
 * new end; the old sentinel is then overwritten by the character.
 */
void pipe_putc(struct pipeline *const pipeline, char c)
{
	if (pipeline->buf[pipeline->idx] == PIPE_MAGIC) {
		pipeline->buf = xrealloc(pipeline->buf, pipeline->len + PIPE_GROW);
		memset(pipeline->buf + pipeline->len, 0, PIPE_GROW);
		pipeline->len += PIPE_GROW;
		pipeline->buf[pipeline->len - 1] = PIPE_MAGIC;
	}
	pipeline->buf[pipeline->idx++] = (c);
}

/* shorthand that relies on a variable named 'pipeline' being in scope */
#define pipeputc(c)     pipe_putc(pipeline, c)
660
/*
 * print_subst_w_backrefs - append the expanded replacement text of one
 * substitution to the pipeline buffer.
 *
 * line:     text the regex was matched against; the regmatch offsets are
 *           relative to it.
 * replace:  replacement string; may contain \0..\9 back references,
 *           backslash-escaped characters and '&'.
 * regmatch: match offsets, entry 0 being the whole match.
 * pipeline: output buffer (the name is required by the pipeputc() macro).
 * matches:  highest valid index into regmatch.
 */
static void print_subst_w_backrefs(const char *line, const char *replace,
	regmatch_t * regmatch, struct pipeline *const pipeline, int matches)
{
	int i;

	/* go through the replacement string */
	for (i = 0; replace[i]; i++) {
		const char cur = replace[i];
		const char next = replace[i + 1];

		/* a back reference (\1, \2, etc.): emit the text that group matched */
		if (cur == '\\' && isdigit((unsigned char) next)) {
			int backref = next - '0';
			int j;

			++i;		/* i now indexes the backref number, instead of the leading slash */
			/* groups that do not exist or did not take part in the
			 * match expand to nothing */
			if (backref <= matches && regmatch[backref].rm_so != -1)
				for (j = regmatch[backref].rm_so; j < regmatch[backref].rm_eo;
					j++)
					pipeputc(line[j]);
		}

		/* a backslash-escaped character: emit the character itself.
		 * A backslash that is the last character has nothing to escape;
		 * it falls through to the literal case below rather than
		 * consuming the string terminator. */
		else if (cur == '\\' && next != '\0') {
			++i;
			pipeputc(next);
		}

		/* an unescaped '&': emit the whole matched text, whose offsets
		 * are held in regmatch[0] */
		else if (cur == '&') {
			int j;

			for (j = regmatch[0].rm_so; j < regmatch[0].rm_eo; j++)
				pipeputc(line[j]);
		}
		/* nothing special, just copy this char of the replacement string */
		else
			pipeputc(cur);
	}
}
706
707 static int do_subst_command(sed_cmd_t * sed_cmd, char **line)
708 {
709         char *hackline = *line;
710         struct pipeline thepipe = { NULL, 0, 0 };
711         struct pipeline *const pipeline = &thepipe;
712         int altered = 0;
713         int result;
714         regmatch_t *regmatch = NULL;
715         regex_t *current_regex;
716
717         if (sed_cmd->sub_match == NULL) {
718                 current_regex = previous_regex_ptr;
719         } else {
720                 previous_regex_ptr = current_regex = sed_cmd->sub_match;
721         }
722         result = regexec(current_regex, hackline, 0, NULL, 0);
723
724         /* we only proceed if the substitution 'search' expression matches */
725         if (result == REG_NOMATCH) {
726                 return 0;
727         }
728
729         /* whaddaya know, it matched. get the number of back references */
730         regmatch = xmalloc(sizeof(regmatch_t) * (sed_cmd->num_backrefs + 1));
731
732         /* allocate more PIPE_GROW bytes
733            if replaced string is larger than original */
734         thepipe.len = strlen(hackline) + PIPE_GROW;
735         thepipe.buf = xcalloc(1, thepipe.len);
736         /* buffer magic */
737         thepipe.buf[thepipe.len - 1] = PIPE_MAGIC;
738
739         /* and now, as long as we've got a line to try matching and if we can match
740          * the search string, we make substitutions */
741         while ((*hackline || !altered)
742                 && (regexec(current_regex, hackline, sed_cmd->num_backrefs + 1,
743                                 regmatch, 0) != REG_NOMATCH)) {
744                 int i;
745
746                 /* print everything before the match */
747                 for (i = 0; i < regmatch[0].rm_so; i++)
748                         pipeputc(hackline[i]);
749
750                 /* then print the substitution string */
751                 print_subst_w_backrefs(hackline, sed_cmd->replace, regmatch, pipeline,
752                         sed_cmd->num_backrefs);
753
754                 /* advance past the match */
755                 hackline += regmatch[0].rm_eo;
756                 /* flag that something has changed */
757                 altered++;
758
759                 /* if we're not doing this globally, get out now */
760                 if (!sed_cmd->sub_g) {
761                         break;
762                 }
763         }
764         for (; *hackline; hackline++)
765                 pipeputc(*hackline);
766         if (thepipe.buf[thepipe.idx] == PIPE_MAGIC)
767                 thepipe.buf[thepipe.idx] = 0;
768
769         /* cleanup */
770         free(regmatch);
771
772         free(*line);
773         *line = thepipe.buf;
774         return altered;
775 }
776
777 static sed_cmd_t *branch_to(const char *label)
778 {
779         sed_cmd_t *sed_cmd;
780
781         for (sed_cmd = sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
782                 if ((sed_cmd->label) && (strcmp(sed_cmd->label, label) == 0)) {
783                         break;
784                 }
785         }
786
787         /* If no match returns last command */
788         return (sed_cmd);
789 }
790
791 static void process_file(FILE * file)
792 {
793         char *pattern_space;    /* Posix requires it be able to hold at least 8192 bytes */
794         char *hold_space = NULL;        /* Posix requires it be able to hold at least 8192 bytes */
795         static int linenum = 0; /* GNU sed does not restart counting lines at EOF */
796         unsigned int still_in_range = 0;
797         int altered;
798         int force_print;
799
800         pattern_space = bb_get_chomped_line_from_file(file);
801         if (pattern_space == NULL) {
802                 return;
803         }
804  
805         /* go through every line in the file */
806         do {
807                 char *next_line;
808                 sed_cmd_t *sed_cmd;
809                 int substituted = 0;
810                 /* This enables whole blocks of commands to be mask'ed out if the lead address doesnt match */
811                 int block_mask = 1;
812
813                 /* Read one line in advance so we can act on the last line, the '$' address */
814                 next_line = bb_get_chomped_line_from_file(file);
815
816                 linenum++;
817                 altered = 0;
818                 force_print = 0;
819
820                 /* for every line, go through all the commands */
821                 for (sed_cmd = sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
822                         int deleted = 0;
823
824                         /*
825                          * entry point into sedding...
826                          */
827                         int matched = (
828                                 /* no range necessary */
829                                 (sed_cmd->beg_line == 0 && sed_cmd->end_line == 0
830                                         && sed_cmd->beg_match == NULL
831                                         && sed_cmd->end_match == NULL) ||
832                                 /* this line number is the first address we're looking for */
833                                 (sed_cmd->beg_line > 0 && (sed_cmd->beg_line == linenum)) ||
834                                 /* this line matches our first address regex */
835                                 (sed_cmd->beg_match
836                                         && (regexec(sed_cmd->beg_match, pattern_space, 0, NULL,
837                                                         0) == 0)) ||
838                                 /* we are currently within the beginning & ending address range */
839                                 still_in_range || ((sed_cmd->beg_line == -1)
840                                         && (next_line == NULL))
841                                 );
842
843                         if (sed_cmd->cmd == '{') {
844                                 block_mask = block_mask & matched;
845                         }
846 //                      matched &= block_mask;
847
848                         if (sed_cmd->invert ^ (matched & block_mask)) {
849                                 /* Update last used regex incase a blank substitute BRE is found */
850                                 if (sed_cmd->beg_match) {
851                                         previous_regex_ptr = sed_cmd->beg_match;
852                                 }
853
854                                 /*
855                                  * actual sedding
856                                  */
857                                 switch (sed_cmd->cmd) {
858                                 case '=':
859                                         printf("%d\n", linenum);
860                                         break;
861                                 case 'P':{
862                                         /* Write the current pattern space upto the first newline */
863                                         char *tmp = strchr(pattern_space, '\n');
864
865                                         if (tmp) {
866                                                 *tmp = '\0';
867                                         }
868                                 }
869                                 case 'p':       /* Write the current pattern space to output */
870                                         puts(pattern_space);
871                                         break;
872                                 case 'd':
873                                         altered++;
874                                         deleted = 1;
875                                         break;
876
877                                 case 's':
878
879                                         /*
880                                          * Some special cases for 's' printing to make it compliant with
881                                          * GNU sed printing behavior (aka "The -n | s///p Matrix"):
882                                          *
883                                          *    -n ONLY = never print anything regardless of any successful
884                                          *    substitution
885                                          *
886                                          *    s///p ONLY = always print successful substitutions, even if
887                                          *    the pattern_space is going to be printed anyway (pattern_space
888                                          *    will be printed twice).
889                                          *
890                                          *    -n AND s///p = print ONLY a successful substitution ONE TIME;
891                                          *    no other lines are printed - this is the reason why the 'p'
892                                          *    flag exists in the first place.
893                                          */
894
895 #ifdef CONFIG_FEATURE_SED_EMBEDED_NEWLINE
896                                         /* HACK: escape newlines twice so regex can match them */
897                                 {
898                                         int offset = 0;
899
900                                         while (strchr(pattern_space + offset, '\n') != NULL) {
901                                                 char *tmp;
902
903                                                 pattern_space =
904                                                         xrealloc(pattern_space,
905                                                         strlen(pattern_space) + 2);
906                                                 tmp = strchr(pattern_space + offset, '\n');
907                                                 memmove(tmp + 1, tmp, strlen(tmp) + 1);
908                                                 tmp[0] = '\\';
909                                                 tmp[1] = 'n';
910                                                 offset = tmp - pattern_space + 2;
911                                         }
912                                 }
913 #endif
914                                         /* we print the pattern_space once, unless we were told to be quiet */
915                                         substituted = do_subst_command(sed_cmd, &pattern_space);
916
917 #ifdef CONFIG_FEATURE_SED_EMBEDED_NEWLINE
918                                         /* undo HACK: escape newlines twice so regex can match them */
919                                         {
920                                                 char *tmp = pattern_space;
921
922                                                 while ((tmp = strstr(tmp, "\\n")) != NULL) {
923                                                         memmove(tmp, tmp + 1, strlen(tmp + 1) + 1);
924                                                         tmp[0] = '\n';
925                                                 }
926                                         }
927 #endif
928                                         altered |= substituted;
929                                         if (!be_quiet && altered && ((sed_cmd->next == NULL)
930                                                         || (sed_cmd->next->cmd != 's'))) {
931                                                 force_print = 1;
932                                         }
933
934                                         /* we also print the line if we were given the 'p' flag
935                                          * (this is quite possibly the second printing) */
936                                         if ((sed_cmd->sub_p) && altered) {
937                                                 puts(pattern_space);
938                                         }
939                                         break;
940                                 case 'a':
941                                         puts(pattern_space);
942                                         fputs(sed_cmd->editline, stdout);
943                                         altered++;
944                                         break;
945
946                                 case 'i':
947                                         fputs(sed_cmd->editline, stdout);
948                                         break;
949
950                                 case 'c':
951                                         /* single-address case */
952                                         if ((sed_cmd->end_match == NULL && sed_cmd->end_line == 0)
953                                                 /* multi-address case */
954                                                 /* - matching text */
955                                                 || (sed_cmd->end_match
956                                                         && (regexec(sed_cmd->end_match, pattern_space, 0,
957                                                                         NULL, 0) == 0))
958                                                 /* - matching line numbers */
959                                                 || (sed_cmd->end_line > 0
960                                                         && sed_cmd->end_line == linenum)) {
961                                                 fputs(sed_cmd->editline, stdout);
962                                         }
963                                         altered++;
964
965                                         break;
966
967                                 case 'r':{
968                                         FILE *outfile;
969
970                                         outfile = fopen(sed_cmd->filename, "r");
971                                         if (outfile) {
972                                                 char *line;
973
974                                                 while ((line =
975                                                                 bb_get_chomped_line_from_file(outfile)) !=
976                                                         NULL) {
977                                                         pattern_space =
978                                                                 xrealloc(pattern_space,
979                                                                 strlen(line) + strlen(pattern_space) + 2);
980                                                         strcat(pattern_space, "\n");
981                                                         strcat(pattern_space, line);
982                                                 }
983                                                 bb_xprint_and_close_file(outfile);
984                                         }
985
986                                 }
987                                         break;
988                                 case 'q':       /* Branch to end of script and quit */
989                                         deleted = 1;
990                                         /* Exit the outer while loop */
991                                         free(next_line);
992                                         next_line = NULL;
993                                         break;
994                                 case 'n':       /* Read next line from input */
995                                         free(pattern_space);
996                                         pattern_space = next_line;
997                                         next_line = bb_get_chomped_line_from_file(file);
998                                         linenum++;
999                                         break;
1000                                 case 'N':       /* Append the next line to the current line */
1001                                         if (next_line) {
1002                                                 pattern_space =
1003                                                         realloc(pattern_space,
1004                                                         strlen(pattern_space) + strlen(next_line) + 2);
1005                                                 strcat(pattern_space, "\n");
1006                                                 strcat(pattern_space, next_line);
1007                                                 next_line = bb_get_chomped_line_from_file(file);
1008                                                 linenum++;
1009                                         }
1010                                         break;
1011                                 case 'b':
1012                                         sed_cmd = branch_to(sed_cmd->label);
1013                                         break;
1014                                 case 't':
1015                                         if (substituted) {
1016                                                 sed_cmd = branch_to(sed_cmd->label);
1017                                         }
1018                                         break;
1019                                 case 'y':{
1020                                         int i;
1021
1022                                         for (i = 0; pattern_space[i] != 0; i++) {
1023                                                 int j;
1024
1025                                                 for (j = 0; sed_cmd->translate[j]; j += 2) {
1026                                                         if (pattern_space[i] == sed_cmd->translate[j]) {
1027                                                                 pattern_space[i] = sed_cmd->translate[j + 1];
1028                                                         }
1029                                                 }
1030                                         }
1031                                 }
1032                                         break;
1033                                 case 'g':       /* Replace pattern space with hold space */
1034                                         free(pattern_space);
1035                                         pattern_space = strdup(hold_space);
1036                                         break;
1037                                 case 'G': {     /* Append newline and hold space to pattern space */
1038                                         int pattern_space_size = 0;
1039                                         if (pattern_space) {
1040                                                 pattern_space_size = strlen(pattern_space);
1041                                         }
1042                                         pattern_space = xrealloc(pattern_space, pattern_space_size + strlen(hold_space) + 2);
1043                                         strcat(pattern_space, "\n");
1044                                         strcat(pattern_space, hold_space); 
1045                                         break;
1046                                 }
1047                                 case 'h':       /* Replace hold space with pattern space */
1048                                         free(hold_space);
1049                                         hold_space = strdup(pattern_space);
1050                                         break;
1051                                 case 'H': {     /* Append newline and pattern space to hold space */
1052                                         int hold_space_size = 0;
1053                                         if (hold_space) {
1054                                                 hold_space_size = strlen(hold_space);
1055                                         }
1056                                         hold_space = xrealloc(hold_space, hold_space_size + strlen(pattern_space) + 2);
1057                                         strcat(hold_space, "\n");
1058                                         strcat(hold_space, pattern_space); 
1059                                         break;
1060                                 }
1061                                 case 'x':{
1062                                         /* Swap hold and pattern space */
1063                                         char *tmp = pattern_space;
1064                                         pattern_space = hold_space;
1065                                         hold_space = tmp;
1066                                         break;
1067                                 }
1068                                 }
1069                         }
1070
1071                         /*
1072                          * exit point from sedding...
1073                          */
1074                         if (matched) {
1075                                 if (
1076                                         /* this is a single-address command or... */
1077                                         (sed_cmd->end_line == 0 && sed_cmd->end_match == NULL)
1078                                         /* If only one address */
1079                                         /* we were in the middle of our address range (this
1080                                          * isn't the first time through) and.. */
1081                                         || ((still_in_range == 1)
1082                                                 /* this line number is the last address we're looking for or... */
1083                                                 && ((sed_cmd->end_line > 0
1084                                                                 && (sed_cmd->end_line == linenum))
1085                                                         /* this line matches our last address regex */
1086                                                         || (sed_cmd->end_match
1087                                                                 && (regexec(sed_cmd->end_match, pattern_space,
1088                                                                                 0, NULL, 0) == 0))))) {
1089                                         /* we're out of our address range */
1090                                         still_in_range = 0;
1091                                 } else {
1092                                         /* didn't hit the exit? then we're still in the middle of an address range */
1093                                         still_in_range = 1;
1094                                 }
1095                         }
1096
1097                         if (sed_cmd->cmd == '}') {
1098                                 block_mask = 1;
1099                         }
1100
1101                         if (deleted)
1102                                 break;
1103                 }
1104
1105                 /* we will print the line unless we were told to be quiet or if the
1106                  * line was altered (via a 'd'elete or 's'ubstitution), in which case
1107                  * the altered line was already printed */
1108                 if ((!be_quiet && !altered) || force_print) {
1109                         puts(pattern_space);
1110                 }
1111                 free(pattern_space);
1112                 pattern_space = next_line;
1113         } while (pattern_space);
1114 }
1115
1116 extern int sed_main(int argc, char **argv)
1117 {
1118         int opt, status = EXIT_SUCCESS;
1119
1120 #ifdef CONFIG_FEATURE_CLEAN_UP
1121         /* destroy command strings on exit */
1122         if (atexit(destroy_cmd_strs) == -1)
1123                 bb_perror_msg_and_die("atexit");
1124 #endif
1125
1126         /* do normal option parsing */
1127         while ((opt = getopt(argc, argv, "ne:f:")) > 0) {
1128                 switch (opt) {
1129                 case 'n':
1130                         be_quiet++;
1131                         break;
1132                 case 'e':{
1133                         char *str_cmd = strdup(optarg);
1134
1135                         add_cmd_str(str_cmd);
1136                         free(str_cmd);
1137                         break;
1138                 }
1139                 case 'f':
1140                         load_cmd_file(optarg);
1141                         break;
1142                 default:
1143                         bb_show_usage();
1144                 }
1145         }
1146
1147         /* if we didn't get a pattern from a -e and no command file was specified,
1148          * argv[optind] should be the pattern. no pattern, no worky */
1149         if (sed_cmd_head.next == NULL) {
1150                 if (argv[optind] == NULL)
1151                         bb_show_usage();
1152                 else {
1153                         char *str_cmd = strdup(argv[optind]);
1154
1155                         add_cmd_str(strdup(str_cmd));
1156                         free(str_cmd);
1157                         optind++;
1158                 }
1159         }
1160
1161         /* argv[(optind)..(argc-1)] should be names of file to process. If no
1162          * files were specified or '-' was specified, take input from stdin.
1163          * Otherwise, we process all the files specified. */
1164         if (argv[optind] == NULL || (strcmp(argv[optind], "-") == 0)) {
1165                 process_file(stdin);
1166         } else {
1167                 int i;
1168                 FILE *file;
1169
1170                 for (i = optind; i < argc; i++) {
1171                         file = bb_wfopen(argv[i], "r");
1172                         if (file) {
1173                                 process_file(file);
1174                                 fclose(file);
1175                         } else
1176                                 status = EXIT_FAILURE;
1177                 }
1178         }
1179
1180         return status;
1181 }