834f06638fb253a232003752d725390aec0c231e
[oweals/busybox.git] / editors / sed.c
1 /*
2  * sed.c - very minimalist version of sed
3  *
4  * Copyright (C) 1999,2000,2001 by Lineo, inc. and Mark Whitley
5  * Copyright (C) 1999,2000,2001 by Mark Whitley <markw@codepoet.org>
6  * Copyright (C) 2002  Matt Kraai
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21  *
22  */
23
24 /*
25         Supported features and commands in this version of sed:
26
27          - comments ('#')
28          - address matching: num|/matchstr/[,num|/matchstr/|$]command
         - commands: (p)rint, (d)elete, (s)ubstitute (with g & I flags)
30          - edit commands: (a)ppend, (i)nsert, (c)hange
31          - file commands: (r)ead
32          - backreferences in substitution expressions (\1, \2...\9)
33          - grouped commands: {cmd1;cmd2}
34
35          (Note: Specifying an address (range) to match is *optional*; commands
36          default to the whole pattern space if no specific address match was
37          requested.)
38
39         Unsupported features:
40
41          - transliteration (y/source-chars/dest-chars/) (use 'tr')
42          - no pattern space hold space storing / swapping (x, etc.)
43          - no labels / branching (: label, b, t, and friends)
44          - and lots, lots more.
45
46         Reference http://www.opengroup.org/onlinepubs/007904975/utilities/sed.html
47 */
48
49 #include <stdio.h>
50 #include <unistd.h> /* for getopt() */
51 #include <regex.h>
52 #include <string.h> /* for strdup() */
53 #include <errno.h>
54 #include <ctype.h> /* for isspace() */
55 #include <stdlib.h>
56 #include "busybox.h"
57
/* the spec says a label must be at least 8 chars; behaviour is unspecified if it is longer than 8 chars */
59 #define SED_LABEL_LENGTH        8
60
/* symbols provided outside this file */
extern void xregcomp(regex_t *preg, const char *regex, int cflags);
extern int optind;	/* getopt() state, also declared by unistd.h */
extern char *optarg;	/* getopt() state, also declared by unistd.h */

/* option state */
static int be_quiet;	/* static storage, so it starts out as 0 */
static const char bad_format_in_subst[] =
	"bad format in substitution expression";
69
70 typedef struct sed_cmd_s {
71         /* Order by alignment requirements */
72
73         /* address storage */
74         regex_t *beg_match; /* sed -e '/match/cmd' */
75         regex_t *end_match; /* sed -e '/match/,/end_match/cmd' */
76
77         /* SUBSTITUTION COMMAND SPECIFIC FIELDS */
78
79         /* sed -e 's/sub_match/replace/' */
80         regex_t *sub_match;
81         char *replace;
82
83         /* EDIT COMMAND (a,i,c) SPECIFIC FIELDS */
84         char *editline;
85
86         /* FILE COMMAND (r) SPECIFIC FIELDS */
87         char *filename;
88
89         /* address storage */
90         int beg_line; /* 'sed 1p'   0 == no begining line, apply commands to all lines */
91         int end_line; /* 'sed 1,3p' 0 == no end line, use only beginning. -1 == $ */
92         /* SUBSTITUTION COMMAND SPECIFIC FIELDS */
93
94         unsigned int num_backrefs:4; /* how many back references (\1..\9) */
95                         /* Note:  GNU/POSIX sed does not save more than nine backrefs, so
96                          * we only use 4 bits to hold the number */
97         unsigned int sub_g:1; /* sed -e 's/foo/bar/g' (global) */
98         unsigned int sub_p:2; /* sed -e 's/foo/bar/p' (print substitution) */
99
100         /* TRANSLATE COMMAND */
101         char *translate;
102
103         /* GENERAL FIELDS */
104         /* the command */
105         char cmd; /* p,d,s (add more at your leisure :-) */
106
107         /* inversion flag */
108         int invert;         /* the '!' after the address */
109
110         /* Branch commands */
111         char label[SED_LABEL_LENGTH + 1];
112
113         /* next command in list (sequential list of specified commands) */
114         struct sed_cmd_s *linear;
115
116 } sed_cmd_t;
117
118 /* globals */
119 /* linked list of sed commands */
120 static sed_cmd_t sed_cmd_head;
121 static sed_cmd_t *sed_cmd_tail = &sed_cmd_head;
122
123 const char * const semicolon_whitespace = "; \n\r\t\v\0";
124
125 #ifdef CONFIG_FEATURE_CLEAN_UP
126 static void destroy_cmd_strs(void)
127 {
128         sed_cmd_t *sed_cmd = sed_cmd_head.linear;
129
130         while (sed_cmd) {
131                 sed_cmd_t *sed_cmd_next = sed_cmd->linear;
132
133                 if (sed_cmd->beg_match) {
134                         regfree(sed_cmd->beg_match);
135                         free(sed_cmd->beg_match);
136                 }
137                 if (sed_cmd->end_match) {
138                         regfree(sed_cmd->end_match);
139                         free(sed_cmd->end_match);
140                 }
141                 if (sed_cmd->sub_match) {
142                         regfree(sed_cmd->sub_match);
143                         free(sed_cmd->sub_match);
144                 }
145                 free(sed_cmd->replace);
146                 free(sed_cmd);
147                 sed_cmd = sed_cmd_next;
148         }
149 }
150 #endif
151
/*
 * index_of_next_unescaped_regexp_delim - scan str from its first character
 * and return the index of the first occurrence of 'delimiter' (typically a
 * forward slash) that is neither preceded by a backslash nor located inside
 * a [bracket expression].  Returns -1 if the end of the string is reached
 * without finding one.
 */
static int index_of_next_unescaped_regexp_delim(const char delimiter, const char *str)
{
	int open_at = -1;		/* index of the '[' we are inside of, or -1 */
	int after_backslash = 0;	/* previous character was an unescaped '\' */
	int pos;

	for (pos = 0; str[pos] != '\0'; pos++) {
		const char c = str[pos];

		if (open_at >= 0) {
			/* a ']' directly after '[' or after '[^' is a member of
			 * the set, not its end */
			const int first_in_set = (pos == open_at + 1)
				|| (pos == open_at + 2 && str[pos - 1] == '^');

			if (c == ']' && !first_in_set)
				open_at = -1;
			continue;
		}
		if (after_backslash) {
			after_backslash = 0;
			continue;
		}
		if (c == '\\') {
			after_backslash = 1;
			continue;
		}
		if (c == '[') {
			open_at = pos;
			continue;
		}
		if (c == delimiter)
			return pos;
	}

	/* ran off the end of the string without seeing the delimiter */
	return -1;
}
183
184 static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
185 {
186         const char *cmdstr_ptr = cmdstr;
187         char delimiter;
188         int idx = 0;
189
190         /* verify that the 's' is followed by something.  That something
191          * (typically a 'slash') is now our regexp delimiter... */
192         if (*cmdstr == '\0')
193                 bb_error_msg_and_die(bad_format_in_subst);
194         else
195                 delimiter = *cmdstr_ptr;
196
197         cmdstr_ptr++;
198
199         /* save the match string */
200         idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
201         if (idx == -1) {
202                 bb_error_msg_and_die(bad_format_in_subst);
203         }
204         *match = bb_xstrndup(cmdstr_ptr, idx);
205
206         /* save the replacement string */
207         cmdstr_ptr += idx + 1;
208         idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
209         if (idx == -1) {
210                 bb_error_msg_and_die(bad_format_in_subst);
211         }
212         *replace = bb_xstrndup(cmdstr_ptr, idx);
213
214         return((cmdstr_ptr - cmdstr) + idx);
215 }
216
/*
 * get_address - parse one address at the start of my_str.
 *
 * Recognised forms:
 *   - a decimal line number, stored through linenum
 *   - '$', stored through linenum as -1
 *   - /regex/ or \cregexc (c being any delimiter character); the regex is
 *     compiled into a newly allocated regex_t stored through regex
 *
 * my_str is modified for the regex forms: the closing delimiter is
 * overwritten with NUL so the expression can be compiled in place.
 * Dies if a regex address has no closing delimiter.
 *
 * Returns the index in my_str just past where the address ends, or 0 if
 * my_str does not start with an address.
 */
static int get_address(char *my_str, int *linenum, regex_t **regex)
{
	int idx = 0;

	/* ctype functions need an unsigned char value */
	if (isdigit((unsigned char)my_str[idx])) {
		char *endstr;
		*linenum = strtol(my_str, &endstr, 10);
		/* endstr shouldnt ever equal NULL */
		idx = endstr - my_str;
	}
	else if (my_str[idx] == '$') {
		*linenum = -1;
		idx++;
	}
	else if (my_str[idx] == '/' || my_str[idx] == '\\') {
		int idx_start = 1;	/* where the expression text begins */
		int delim_offset;
		char delimiter;

		delimiter = '/';
		if (my_str[idx] == '\\') {
			idx_start++;
			delimiter = my_str[++idx];
			/* a lone backslash at the end of the string names no
			 * delimiter; do not step past the terminator */
			if (delimiter == '\0') {
				bb_error_msg_and_die("unterminated match expression");
			}
		}
		idx++;
		/* Check the search result before adding it to idx: once added,
		 * a -1 "not found" can no longer be told apart. */
		delim_offset = index_of_next_unescaped_regexp_delim(delimiter, my_str + idx);
		if (delim_offset == -1) {
			bb_error_msg_and_die("unterminated match expression");
		}
		idx += delim_offset;
		my_str[idx] = '\0';
		*regex = (regex_t *)xmalloc(sizeof(regex_t));
		xregcomp(*regex, my_str+idx_start, REG_NEWLINE);
		idx++; /* so it points to the next character after the last delimiter */
	}
	return idx;
}
254
255 static int parse_subst_cmd(sed_cmd_t * const sed_cmd, const char *substr)
256 {
257         int cflags = 0;
258         char *match;
259         int idx = 0;
260         int j;
261
262         /*
263          * the string that gets passed to this function should look like this:
264          *    s/match/replace/gIp
265          *    ||     |        |||
266          *    mandatory       optional
267          *
268          *    (all three of the '/' slashes are mandatory)
269          */
270
271         idx = parse_regex_delim(substr, &match, &sed_cmd->replace);
272
273         /* determine the number of back references in the match string */
274         /* Note: we compute this here rather than in the do_subst_command()
275          * function to save processor time, at the expense of a little more memory
276          * (4 bits) per sed_cmd */
277         
278         /* sed_cmd->num_backrefs = 0; */ /* XXX: not needed? --apparently not */ 
279         for (j = 0; match[j]; j++) {
280                 /* GNU/POSIX sed does not save more than nine backrefs */
281                 if (match[j] == '\\' && match[j+1] == '(' && sed_cmd->num_backrefs <= 9)
282                         sed_cmd->num_backrefs++;
283         }
284
285         /* process the flags */
286         while (substr[++idx]) {
287                 switch (substr[idx]) {
288                         case 'g':
289                                 sed_cmd->sub_g = 1;
290                                 break;
291                         /* Hmm, i dont see the I option mentioned in the standard */
292                         case 'I':
293                                 cflags |= REG_ICASE;
294                                 break;
295                         case 'p':
296                                 sed_cmd->sub_p = 1;
297                                 break;
298                         default:
299                                 /* any whitespace or semicolon trailing after a s/// is ok */
300                                 if (strchr(semicolon_whitespace, substr[idx]))
301                                         goto out;
302                                 /* else */
303                                 bb_error_msg_and_die("bad option in substitution expression");
304                 }
305         }
306
307 out:    
308         /* compile the match string into a regex */
309         sed_cmd->sub_match = (regex_t *)xmalloc(sizeof(regex_t));
310         xregcomp(sed_cmd->sub_match, match, cflags);
311         free(match);
312
313         return idx;
314 }
315
/*
 * replace_slash_n - rewrite 'string' in place so that every two-character
 * sequence backslash + 'n' becomes a single newline character.  The string
 * only ever gets shorter, so no reallocation is needed.
 */
static void replace_slash_n(char *string)
{
	const char *src = string;	/* next character to examine */
	char *dst = string;		/* next position to write; dst <= src */

	while (*src != '\0') {
		if (src[0] == '\\' && src[1] == 'n') {
			/* two input characters collapse into one output character */
			*dst++ = '\n';
			src += 2;
		} else {
			*dst++ = *src++;
		}
	}
	*dst = '\0';
}
330
331 static int parse_translate_cmd(sed_cmd_t * const sed_cmd, const char *cmdstr)
332 {
333         char *match;
334         char *replace;
335         int idx;
336         int i;
337
338         idx = parse_regex_delim(cmdstr, &match, &replace);
339         replace_slash_n(match);
340         replace_slash_n(replace);
341         sed_cmd->translate = xcalloc(1, (strlen(match) + 1) * 2);
342         for (i = 0; (match[i] != 0) && (replace[i] != 0); i++) {
343                 sed_cmd->translate[i * 2] = match[i];
344                 sed_cmd->translate[(i * 2) + 1] = replace[i];
345         }
346         return(idx + 1);
347 }
348
349 static int parse_edit_cmd(sed_cmd_t *sed_cmd, const char *editstr)
350 {
351         int i, j;
352
353         /*
354          * the string that gets passed to this function should look like this:
355          *
356          *    need one of these 
357          *    |
358          *    |    this backslash (immediately following the edit command) is mandatory
359          *    |    |
360          *    [aic]\
361          *    TEXT1\
362          *    TEXT2\
363          *    TEXTN
364          *
365          * as soon as we hit a TEXT line that has no trailing '\', we're done.
366          * this means a command like:
367          *
368          * i\
369          * INSERTME
370          *
371          * is a-ok.
372          *
373          */
374         if ((*editstr != '\\') || ((editstr[1] != '\n') && (editstr[1] != '\r'))) {
375                 bb_error_msg_and_die("bad format in edit expression");
376         }
377
378         /* store the edit line text */
379         sed_cmd->editline = xmalloc(strlen(&editstr[2]) + 2);
380         for (i = 2, j = 0; editstr[i] != '\0' && strchr("\r\n", editstr[i]) == NULL;
381                         i++, j++) {
382                 if ((editstr[i] == '\\') && strchr("\n\r", editstr[i+1]) != NULL) {
383                         sed_cmd->editline[j] = '\n';
384                         i++;
385                 } else
386                         sed_cmd->editline[j] = editstr[i];
387         }
388
389         /* figure out if we need to add a newline */
390         if (sed_cmd->editline[j-1] != '\n')
391                 sed_cmd->editline[j++] = '\n';
392
393         /* terminate string */
394         sed_cmd->editline[j] = '\0';
395
396         return i;
397 }
398
399
400 static int parse_file_cmd(sed_cmd_t *sed_cmd, const char *filecmdstr)
401 {
402         int idx = 0;
403         int filenamelen = 0;
404
405         /*
406          * the string that gets passed to this function should look like this:
407          *    '[ ]filename'
408          *      |  |
409          *      |  a filename
410          *      |
411          *     optional whitespace
412
413          *   re: the file to be read, the GNU manual says the following: "Note that
414          *   if filename cannot be read, it is treated as if it were an empty file,
415          *   without any error indication." Thus, all of the following commands are
416          *   perfectly leagal:
417          *
418          *   sed -e '1r noexist'
419          *   sed -e '1r ;'
420          *   sed -e '1r'
421          */
422
423         /* the file command may be followed by whitespace; move past it. */
424         while (isspace(filecmdstr[++idx]))
425                 { ; }
426                 
427         /* the first non-whitespace we get is a filename. the filename ends when we
428          * hit a normal sed command terminator or end of string */
429         filenamelen = strcspn(&filecmdstr[idx], semicolon_whitespace);
430         sed_cmd->filename = xmalloc(filenamelen + 1);
431         safe_strncpy(sed_cmd->filename, &filecmdstr[idx], filenamelen + 1);
432
433         return idx + filenamelen;
434 }
435
436 /*
437  *  Process the commands arguments
438  */
439 static char *parse_cmd_str(sed_cmd_t * const sed_cmd, char *cmdstr)
440 {
441         /* handle (s)ubstitution command */
442         if (sed_cmd->cmd == 's') {
443                 cmdstr += parse_subst_cmd(sed_cmd, cmdstr);
444         }
445         /* handle edit cmds: (a)ppend, (i)nsert, and (c)hange */
446         else if (strchr("aic", sed_cmd->cmd)) {
447                 if ((sed_cmd->end_line || sed_cmd->end_match) && sed_cmd->cmd != 'c')
448                         bb_error_msg_and_die("only a beginning address can be specified for edit commands");
449                 cmdstr += parse_edit_cmd(sed_cmd, cmdstr);
450         }
451         /* handle file cmds: (r)ead */
452         else if (sed_cmd->cmd == 'r') {
453                 if (sed_cmd->end_line || sed_cmd->end_match)
454                         bb_error_msg_and_die("Command only uses one address");
455                 cmdstr += parse_file_cmd(sed_cmd, cmdstr);
456         }
457         /* handle branch commands */
458         else if (strchr(":bt", sed_cmd->cmd)) {
459                 int length;
460
461                 cmdstr += strspn(cmdstr, " ");
462                 length = strcspn(cmdstr, "; ");
463                 if (length > SED_LABEL_LENGTH) {
464                         length = SED_LABEL_LENGTH;
465                 }
466                 strncpy(sed_cmd->label, cmdstr, length);
467         cmdstr += length;
468         }
469         /* translation command */
470         else if (sed_cmd->cmd == 'y') {
471                 cmdstr += parse_translate_cmd(sed_cmd, cmdstr);
472         }
473         /* if it wasnt a single-letter command that takes no arguments
474          * then it must be an invalid command.
475          */
476         else if (strchr("nNpPqd=", sed_cmd->cmd) == 0) {
477                 bb_error_msg_and_die("Unsupported command %c", sed_cmd->cmd);
478         }
479
480         /* give back whatever's left over */
481         return(cmdstr);
482 }
483
484 static char *add_cmd(sed_cmd_t *sed_cmd, char *cmdstr)
485 {
486         /* Skip over leading whitespace and semicolons */
487         cmdstr += strspn(cmdstr, semicolon_whitespace);
488
489         /* if we ate the whole thing, that means there was just trailing
490          * whitespace or a final / no-op semicolon. either way, get out */
491         if (*cmdstr == '\0') {
492                 return(NULL);
493         }
494
495         /* if this is a comment, jump past it and keep going */
496         if (*cmdstr == '#') {
497                 return(strpbrk(cmdstr, "\n\r"));
498         }
499
500         /* parse the command
501          * format is: [addr][,addr]cmd
502          *            |----||-----||-|
503          *            part1 part2  part3
504          */
505
506         /* first part (if present) is an address: either a '$', a number or a /regex/ */
507         cmdstr += get_address(cmdstr, &sed_cmd->beg_line, &sed_cmd->beg_match);
508
509         /* second part (if present) will begin with a comma */
510         if (*cmdstr == ',') {
511                 int idx;
512                 cmdstr++;
513                 idx = get_address(cmdstr, &sed_cmd->end_line, &sed_cmd->end_match);
514                 if (idx == 0) {
515                         bb_error_msg_and_die("get_address: no address found in string\n"
516                                 "\t(you probably didn't check the string you passed me)");
517                 }
518                 cmdstr += idx;
519         }
520
521         /* skip whitespace before the command */
522         while (isspace(*cmdstr)) {
523                 cmdstr++;
524         }
525
526         /* there my be the inversion flag between part2 and part3 */
527         if (*cmdstr == '!') {
528                 sed_cmd->invert = 1;
529                 cmdstr++;
530
531 #ifdef SED_FEATURE_STRICT_CHECKING
532                 /* According to the spec
533                  * It is unspecified whether <blank>s can follow a '!' character,
534                  * and conforming applications shall not follow a '!' character
535                  * with <blank>s.
536                  */
537                 if (isblank(cmdstr[idx]) {
538                         bb_error_msg_and_die("blank follows '!'");
539                 }
540 #else 
541                 /* skip whitespace before the command */
542                 while (isspace(*cmdstr)) {
543                         cmdstr++;
544                 }
545 #endif
546         }
547
548         /* last part (mandatory) will be a command */
549         if (*cmdstr == '\0')
550                 bb_error_msg_and_die("missing command");
551
552         sed_cmd->cmd = *cmdstr;
553         cmdstr++;
554
555         if (sed_cmd->cmd == '{') {
556                 do {
557                         sed_cmd_t *sed_cmd_new;
558                         char *end_ptr = strpbrk(cmdstr, ";}");
559
560                         *end_ptr = '\0';
561                         sed_cmd_new = xcalloc(1, sizeof(sed_cmd_t));
562                         sed_cmd_new->beg_match = sed_cmd->beg_match;
563                         sed_cmd_new->end_match = sed_cmd->end_match;
564                         sed_cmd_new->beg_line = sed_cmd->beg_line;
565                         sed_cmd_new->end_line = sed_cmd->end_line;
566                         sed_cmd_new->invert = sed_cmd->invert;
567
568                         add_cmd(sed_cmd_new, cmdstr);
569                         cmdstr = end_ptr + 1;
570                 } while (*cmdstr != '\0');
571         } else {
572                 cmdstr = parse_cmd_str(sed_cmd, cmdstr);
573
574                 /* Add the command to the command array */
575                 sed_cmd_tail->linear = sed_cmd;
576                 sed_cmd_tail = sed_cmd_tail->linear;
577         }
578         return(cmdstr);
579 }
580
581 static void add_cmd_str(char *cmdstr)
582 {
583 #ifdef CONFIG_FEATURE_SED_EMBEDED_NEWLINE
584         char *cmdstr_ptr = cmdstr;
585
586         /* HACK: convert "\n" to match tranlated '\n' string */
587         while((cmdstr_ptr = strstr(cmdstr_ptr, "\\n")) != NULL) {
588                 cmdstr = xrealloc(cmdstr, strlen(cmdstr) + 2);
589                 cmdstr_ptr = strstr(cmdstr, "\\n");
590                 memmove(cmdstr_ptr + 1, cmdstr_ptr, strlen(cmdstr_ptr) + 1);
591                 cmdstr_ptr[0] = '\\';
592                 cmdstr_ptr += 3;
593         }
594 #endif
595         do {
596                 sed_cmd_t *sed_cmd;
597                 sed_cmd = xcalloc(1, sizeof(sed_cmd_t));
598                 cmdstr = add_cmd(sed_cmd, cmdstr);
599         } while (cmdstr && strlen(cmdstr));
600 }
601
602
/*
 * load_cmd_file - read a sed script from 'filename' and add every command
 * in it to the global command list.  A line that ends in a backslash is
 * joined with the line after it before being parsed.
 */
static void load_cmd_file(char *filename)
{
	FILE *cmdfile;
	char *line;
	char *nextline;
	char *e;

	cmdfile = bb_xfopen(filename, "r");

	while ((line = bb_get_line_from_file(cmdfile)) != NULL) {
		/* if a line ends with '\' it needs the next line appended to it */
		while (((e = last_char_is(line, '\n')) != NULL)
			   && (e > line) && (e[-1] == '\\')
			   && ((nextline = bb_get_line_from_file(cmdfile)) != NULL)) {
			/* (e - line) + 1 is the current length of line */
			line = xrealloc(line, (e - line) + 1 + strlen(nextline) + 1);
			strcat(line, nextline);
			free(nextline);
		}
		/* eat trailing newline (if any) --if I don't do this, edit commands
		 * (aic) will print an extra newline */
		chomp(line);
		add_cmd_str(line);
		free(line);
	}

	/* the script has been read completely; release the stream */
	fclose(cmdfile);
}
628
/* growable character buffer that a substituted line is assembled in */
struct pipeline {
	char *buf;	/* the storage; its last byte holds PIPE_MAGIC */
	int idx;	/* position the next character is written to */
	int len;	/* allocated size of buf, in bytes */
};
634
635 #define PIPE_MAGIC 0x7f
636 #define PIPE_GROW 64  
637
638 void pipe_putc(struct pipeline *const pipeline, char c)
639 {
640         if (pipeline->buf[pipeline->idx] == PIPE_MAGIC) {
641                 pipeline->buf =
642                         xrealloc(pipeline->buf, pipeline->len + PIPE_GROW);
643                 memset(pipeline->buf + pipeline->len, 0, PIPE_GROW);
644                 pipeline->len += PIPE_GROW;
645                 pipeline->buf[pipeline->len - 1] = PIPE_MAGIC;
646         }
647         pipeline->buf[pipeline->idx++] = (c);
648 }
649
650 #define pipeputc(c)     pipe_putc(pipeline, c)
651
652 #if 0
653 { if (pipeline[pipeline_idx] == PIPE_MAGIC) { \
654         pipeline = xrealloc(pipeline, pipeline_len+PIPE_GROW); \
655         memset(pipeline+pipeline_len, 0, PIPE_GROW); \
656         pipeline_len += PIPE_GROW; \
657         pipeline[pipeline_len-1] = PIPE_MAGIC; } \
658         pipeline[pipeline_idx++] = (c); }
659 #endif
660
/*
 * print_subst_w_backrefs - append the replacement text of an 's' command
 * to the pipeline, expanding the special sequences in it:
 *
 *   \N  (N a digit)  text matched by group N, if N <= matches and the
 *                    group took part in the match; otherwise nothing
 *   \c  (other c)    the character c itself
 *   &                the whole matched text (regmatch[0])
 *
 * line is the string the offsets in regmatch refer to.
 */
static void print_subst_w_backrefs(const char *line, const char *replace,
	regmatch_t *regmatch, struct pipeline *const pipeline, int matches)
{
	int i;

	/* go through the replacement string */
	for (i = 0; replace[i]; i++) {
		/* if we find a backreference (\1, \2, etc.) print the backref'ed text */
		if (replace[i] == '\\' && isdigit((unsigned char)replace[i+1])) {
			int j;
			int backref;
			++i; /* i now indexes the backref number, instead of the leading slash */
			backref = replace[i] - '0';
			/* print out the text held in regmatch[backref] */
			if (backref <= matches && regmatch[backref].rm_so != -1)
				for (j = regmatch[backref].rm_so; j < regmatch[backref].rm_eo; j++)
					pipeputc(line[j]);
		}

		/* if we find a backslash escaped character, print the character.
		 * A backslash that is the very last character has nothing to
		 * escape; it is left for the final branch to print as is, so i
		 * never moves past the terminator. */
		else if (replace[i] == '\\' && replace[i+1] != '\0') {
			++i;
			pipeputc(replace[i]);
		}

		/* An '&' that gets here is always unescaped, because the branch
		 * above consumes the character following a backslash.  Looking
		 * at replace[i-1] would read before the string when '&' comes
		 * first and would misread "\\&".  regmatch[0] holds the offsets
		 * of the whole matched text. */
		else if (replace[i] == '&') {
			int j;
			for (j = regmatch[0].rm_so; j < regmatch[0].rm_eo; j++)
				pipeputc(line[j]);
		}
		/* nothing special, just copy this char of the replacement string */
		else
			pipeputc(replace[i]);
	}
}
703
704 static int do_subst_command(const sed_cmd_t *sed_cmd, char **line)
705 {
706         char *hackline = *line;
707         struct pipeline thepipe = { NULL, 0 , 0};
708         struct pipeline *const pipeline = &thepipe;
709         int altered = 0;
710         regmatch_t *regmatch = NULL;
711
712         /* we only proceed if the substitution 'search' expression matches */
713         if (regexec(sed_cmd->sub_match, hackline, 0, NULL, 0) == REG_NOMATCH)
714                 return 0;
715
716         /* whaddaya know, it matched. get the number of back references */
717         regmatch = xmalloc(sizeof(regmatch_t) * (sed_cmd->num_backrefs+1));
718
719         /* allocate more PIPE_GROW bytes
720            if replaced string is larger than original */
721         thepipe.len = strlen(hackline)+PIPE_GROW;
722         thepipe.buf = xcalloc(1, thepipe.len);
723         /* buffer magic */
724         thepipe.buf[thepipe.len-1] = PIPE_MAGIC;
725
726         /* and now, as long as we've got a line to try matching and if we can match
727          * the search string, we make substitutions */
728         while ((*hackline || !altered) && (regexec(sed_cmd->sub_match, hackline,
729                                         sed_cmd->num_backrefs+1, regmatch, 0) != REG_NOMATCH) ) {
730                 int i;
731
732                 /* print everything before the match */
733                 for (i = 0; i < regmatch[0].rm_so; i++)
734                         pipeputc(hackline[i]);
735
736                 /* then print the substitution string */
737                 print_subst_w_backrefs(hackline, sed_cmd->replace, regmatch, 
738                                 pipeline, sed_cmd->num_backrefs);
739
740                 /* advance past the match */
741                 hackline += regmatch[0].rm_eo;
742                 /* flag that something has changed */
743                 altered++;
744
745                 /* if we're not doing this globally, get out now */
746                 if (!sed_cmd->sub_g)
747                         break;
748         }
749
750         for (; *hackline; hackline++) pipeputc(*hackline);
751         if (thepipe.buf[thepipe.idx] == PIPE_MAGIC) thepipe.buf[thepipe.idx] = 0;
752
753         /* cleanup */
754         free(regmatch);
755
756         free(*line);
757         *line = thepipe.buf;
758         return altered;
759 }
760
761 static sed_cmd_t *branch_to(const char *label)
762 {
763         sed_cmd_t *sed_cmd;
764         for(sed_cmd = sed_cmd_head.linear; sed_cmd; sed_cmd = sed_cmd->linear) {
765                 if (strcmp(sed_cmd->label, label) == 0) {
766                         break;
767                 }
768         }
769
770         /* If no match returns last command */
771         return(sed_cmd);
772 }
773
774 static void process_file(FILE *file)
775 {
776         char *pattern_space;    /* Posix requires it be able to hold at least 8192 bytes */
777         static int linenum = 0; /* GNU sed does not restart counting lines at EOF */
778         unsigned int still_in_range = 0;
779         int altered;
780         int force_print;
781
782         pattern_space = bb_get_chomped_line_from_file(file);
783         if (pattern_space == NULL) {
784                 return;
785         }
786
787         /* go through every line in the file */
788         do {
789                 char *next_line;
790                 sed_cmd_t *sed_cmd;
791                 int substituted = 0;
792
793                 /* Read one line in advance so we can act on the last line, the '$' address */
794                 next_line = bb_get_chomped_line_from_file(file);
795
796                 linenum++;
797                 altered = 0;
798                 force_print = 0;
799
800                 /* for every line, go through all the commands */
801                 for (sed_cmd = sed_cmd_head.linear; sed_cmd; sed_cmd = sed_cmd->linear) {
802                         int deleted = 0;
803
804                         /*
805                          * entry point into sedding...
806                          */
807                         int matched = (
808                                         /* no range necessary */
809                                         (sed_cmd->beg_line == 0 && sed_cmd->end_line == 0 &&
810                                          sed_cmd->beg_match == NULL &&
811                                          sed_cmd->end_match == NULL) ||
812                                         /* this line number is the first address we're looking for */
813                                         (sed_cmd->beg_line && (sed_cmd->beg_line == linenum)) ||
814                                         /* this line matches our first address regex */
815                                         (sed_cmd->beg_match && (regexec(sed_cmd->beg_match, pattern_space, 0, NULL, 0) == 0)) ||
816                                         /* we are currently within the beginning & ending address range */
817                                         still_in_range || ((sed_cmd->beg_line == -1) && (next_line == NULL))
818                            );
819
820                         if (sed_cmd->invert ^ matched) {
821
822                                 /*
823                                  * actual sedding
824                                  */
825                                 switch (sed_cmd->cmd) {
826                                         case '=':
827                                                 printf("%d\n", linenum);
828                                                 break;
829                                         case 'P': {     /* Write the current pattern space upto the first newline */
830                                                         char *tmp = strchr(pattern_space, '\n');
831                                                         if (tmp) {
832                                                                 *tmp = '\0';
833                                                         }
834                                                 }
835                                         case 'p':       /* Write the current pattern space to output */
836                                                 puts(pattern_space);
837                                                 break;
838                                         case 'd':
839                                                 altered++;
840                                                 deleted = 1;
841                                                 break;
842
843                                         case 's':
844
845                                                 /*
846                                                  * Some special cases for 's' printing to make it compliant with
847                                                  * GNU sed printing behavior (aka "The -n | s///p Matrix"):
848                                                  *
849                                                  *    -n ONLY = never print anything regardless of any successful
850                                                  *    substitution
851                                                  *
852                                                  *    s///p ONLY = always print successful substitutions, even if
853                                                  *    the pattern_space is going to be printed anyway (pattern_space
854                                                  *    will be printed twice).
855                                                  *
856                                                  *    -n AND s///p = print ONLY a successful substitution ONE TIME;
857                                                  *    no other lines are printed - this is the reason why the 'p'
858                                                  *    flag exists in the first place.
859                                                  */
860
861 #ifdef CONFIG_FEATURE_SED_EMBEDED_NEWLINE
862                                                 /* HACK: escape newlines twice so regex can match them */
863                                                 {
864                                                         int offset = 0;
865                                                         while(strchr(pattern_space + offset, '\n') != NULL) {
866                                                                 char *tmp;
867                                                                 pattern_space = xrealloc(pattern_space, strlen(pattern_space) + 2);
868                                                                 tmp = strchr(pattern_space + offset, '\n');
869                                                                 memmove(tmp + 1, tmp, strlen(tmp) + 1);
870                                                                 tmp[0] = '\\';
871                                                                 tmp[1] = 'n';
872                                                                 offset = tmp - pattern_space + 2;
873                                                         }
874                                                 }
875 #endif
876                                                 /* we print the pattern_space once, unless we were told to be quiet */
877                                                 substituted = do_subst_command(sed_cmd, &pattern_space);
878
879 #ifdef CONFIG_FEATURE_SED_EMBEDED_NEWLINE
880                                                 /* undo HACK: escape newlines twice so regex can match them */
881                                                 {
882                                                         char *tmp = pattern_space;
883
884                                                         while((tmp = strstr(tmp, "\\n")) != NULL) {
885                                                                 memmove(tmp, tmp + 1, strlen(tmp + 1) + 1);
886                                                                 tmp[0] = '\n';
887                                                         }
888                                                 }
889 #endif
890                                                 altered |= substituted;
891                                                 if (!be_quiet && altered && ((sed_cmd->linear == NULL) || (sed_cmd->linear->cmd != 's'))) {
892                                                         force_print = 1;
893                                                 }
894
895                                                 /* we also print the line if we were given the 'p' flag
896                                                  * (this is quite possibly the second printing) */
897                                                 if ((sed_cmd->sub_p) && altered) {
898                                                         puts(pattern_space);
899                                                 }
900                                                 break;
901                                         case 'a':
902                                                 puts(pattern_space);
903                                                 fputs(sed_cmd->editline, stdout);
904                                                 altered++;
905                                                 break;
906
907                                         case 'i':
908                                                 fputs(sed_cmd->editline, stdout);
909                                                 break;
910
911                                         case 'c':
912                                                 /* single-address case */
913                                                 if ((sed_cmd->end_match == NULL && sed_cmd->end_line == 0)
914                                                 /* multi-address case */
915                                                 /* - matching text */
916                                                 || (sed_cmd->end_match && (regexec(sed_cmd->end_match, pattern_space, 0, NULL, 0) == 0))
917                                                 /* - matching line numbers */
918                                                 || (sed_cmd->end_line > 0 && sed_cmd->end_line == linenum))
919                                                 {
920                                                         fputs(sed_cmd->editline, stdout);
921                                                 }
922                                                 altered++;
923
924                                                 break;
925
926                                         case 'r': {
927                                                         FILE *outfile;
928                                                         puts(pattern_space);
929                                                         outfile = fopen(sed_cmd->filename, "r");
930                                                         if (outfile)
931                                                                 bb_xprint_and_close_file(outfile);
932                                                                 /* else if we couldn't open the output file,
933                                                                  * no biggie, just don't print anything */
934                                                                 altered++;
935                                                 }
936                                                 break;
937                                         case 'q':       /* Branch to end of script and quit */
938                                                 deleted = 1;
939                                                 /* Exit the outer while loop */
940                                                 free(next_line);
941                                                 next_line = NULL;
942                                                 break;
943                                         case 'n':       /* Read next line from input */
944                                                 free(pattern_space);
945                                                 pattern_space = next_line;
946                                                 next_line = bb_get_chomped_line_from_file(file);
947                                                 linenum++;
948                                                 break;
949                                         case 'N':       /* Append the next line to the current line */
950                                                 if (next_line) {
951                                                         pattern_space = realloc(pattern_space, strlen(pattern_space) + strlen(next_line) + 2);
952                                                         strcat(pattern_space, "\n");
953                                                         strcat(pattern_space, next_line);
954                                                         next_line = bb_get_chomped_line_from_file(file);
955                                                         linenum++;
956                                                 }
957                                                 break;
958                                         case 'b':
959                                                 sed_cmd = branch_to(sed_cmd->label);
960                                                 break;
961                                         case 't':
962                                                 if (substituted) {
963                                                         sed_cmd = branch_to(sed_cmd->label);
964                                                 }
965                                                 break;
966                                         case 'y': {
967                                                         int i;
968                                                         for (i = 0; pattern_space[i] != 0; i++) {
969                                                                 int j;
970                                                                 for (j = 0; sed_cmd->translate[j] ;j += 2) {
971                                                                         if (pattern_space[i] == sed_cmd->translate[j]) {
972                                                                                 pattern_space[i] = sed_cmd->translate[j + 1];
973                                                                         }
974                                                                 }
975                                                         }
976                                                 }
977                                                 break;
978                                 }
979                         }
980
981                         /*
982                          * exit point from sedding...
983                          */
984                         if (matched) {
985                                 if (
986                                         /* this is a single-address command or... */
987                                         (sed_cmd->end_line == 0 && sed_cmd->end_match == NULL) || (
988                                                 /* we were in the middle of our address range (this
989                                                  * isn't the first time through) and.. */
990                                                 (still_in_range == 1) && (
991                                                         /* this line number is the last address we're looking for or... */
992                                                         (sed_cmd->end_line && (sed_cmd->end_line == linenum)) ||
993                                                         /* this line matches our last address regex */
994                                                         (sed_cmd->end_match && (regexec(sed_cmd->end_match, pattern_space, 0, NULL, 0) == 0))
995                                                 )
996                                         )
997                                 ) {
998                                         /* we're out of our address range */
999                                         still_in_range = 0;
1000                                 }
1001
1002                                 /* didn't hit the exit? then we're still in the middle of an address range */
1003                                 else {
1004                                         still_in_range = 1;
1005                                 }
1006                         }
1007
1008                         if (deleted)
1009                                 break;
1010                 }
1011
1012                 /* we will print the line unless we were told to be quiet or if the
1013                  * line was altered (via a 'd'elete or 's'ubstitution), in which case
1014                  * the altered line was already printed */
1015                 if ((!be_quiet && !altered) || force_print){
1016                         puts(pattern_space);
1017                 }
1018                 free(pattern_space);
1019                 pattern_space = next_line;
1020         } while (pattern_space);
1021 }
1022
1023 extern int sed_main(int argc, char **argv)
1024 {
1025         int opt, status = EXIT_SUCCESS;
1026
1027 #ifdef CONFIG_FEATURE_CLEAN_UP
1028         /* destroy command strings on exit */
1029         if (atexit(destroy_cmd_strs) == -1)
1030                 bb_perror_msg_and_die("atexit");
1031 #endif
1032
1033         /* do normal option parsing */
1034         while ((opt = getopt(argc, argv, "ne:f:")) > 0) {
1035                 switch (opt) {
1036                         case 'n':
1037                                 be_quiet++;
1038                                 break;
1039                         case 'e': {
1040                                 char *str_cmd = strdup(optarg);
1041                                 add_cmd_str(str_cmd);
1042                                 free(str_cmd);
1043                                 break;
1044                         }
1045                         case 'f': 
1046                                 load_cmd_file(optarg);
1047                                 break;
1048                         default:
1049                                 bb_show_usage();
1050                 }
1051         }
1052
1053         /* if we didn't get a pattern from a -e and no command file was specified,
1054          * argv[optind] should be the pattern. no pattern, no worky */
1055         if (sed_cmd_head.linear == NULL) {
1056                 if (argv[optind] == NULL)
1057                         bb_show_usage();
1058                 else {
1059                         char *str_cmd = strdup(argv[optind]);
1060                         add_cmd_str(strdup(str_cmd));
1061                         free(str_cmd);
1062                         optind++;
1063                 }
1064         }
1065
1066         /* argv[(optind)..(argc-1)] should be names of file to process. If no
1067          * files were specified or '-' was specified, take input from stdin.
1068          * Otherwise, we process all the files specified. */
1069         if (argv[optind] == NULL || (strcmp(argv[optind], "-") == 0)) {
1070                 process_file(stdin);
1071         }
1072         else {
1073                 int i;
1074                 FILE *file;
1075                 for (i = optind; i < argc; i++) {
1076                         file = bb_wfopen(argv[i], "r");
1077                         if (file) {
1078                                 process_file(file);
1079                                 fclose(file);
1080                         } else
1081                                 status = EXIT_FAILURE;
1082                 }
1083         }
1084         
1085         return status;
1086 }