awk: don't append bogus data after NUL in sub(); shrink
authorDenys Vlasenko <vda.linux@googlemail.com>
Sat, 3 Apr 2010 23:17:30 +0000 (01:17 +0200)
committerDenys Vlasenko <vda.linux@googlemail.com>
Sat, 3 Apr 2010 23:17:30 +0000 (01:17 +0200)
also renamed variables to more sensible names

function                                             old     new   delta
mk_re_node                                            56      49      -7
awk_sub                                              601     591     -10

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
editors/awk.c

index 30c6b88ef7827c32c3238d7ffc294c4fba90783a..3ba1a422da9016f7ad7bfa879fcb73ff081e8a93 100644 (file)
@@ -1134,15 +1134,13 @@ static node *new_node(uint32_t info)
        return n;
 }
 
-static node *mk_re_node(const char *s, node *n, regex_t *re)
+static void mk_re_node(const char *s, node *n, regex_t *re)
 {
        n->info = OC_REGEXP;
        n->l.re = re;
        n->r.ire = re + 1;
        xregcomp(re, s, REG_EXTENDED);
        xregcomp(re + 1, s, REG_EXTENDED | REG_ICASE);
-
-       return n;
 }
 
 static node *condition(void)
@@ -1541,7 +1539,10 @@ static regex_t *as_regex(node *op, regex_t *preg)
        return preg;
 }
 
-/* gradually increasing buffer */
+/* gradually increasing buffer.
+ * note that we reallocate even if n == old_size,
+ * and thus there is at least one extra allocated byte.
+ */
 static char* qrealloc(char *b, int n, int *size)
 {
        if (!b || n >= *size) {
@@ -1983,83 +1984,100 @@ static char *awk_printf(node *n)
        return b;
 }
 
-/* common substitution routine
- * replace (nm) substring of (src) that match (n) with (repl), store
- * result into (dest), return number of substitutions. If nm=0, replace
- * all matches. If src or dst is NULL, use $0. If ex=TRUE, enable
- * subexpression matching (\1-\9)
+/* Common substitution routine.
+ * Replace (nm)'th substring of (src) that matches (rn) with (repl),
+ * store result into (dest), return number of substitutions.
+ * If nm = 0, replace all matches.
+ * If src or dst is NULL, use $0.
+ * If subexp != 0, enable subexpression matching (\1-\9).
  */
-static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int ex)
+static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int subexp)
 {
-       char *ds = NULL;
-       const char *s;
+       char *resbuf;
        const char *sp;
-       int c, i, j, di, rl, so, eo, nbs, n, dssize;
+       int match_no, residx, replen, resbufsize;
+       int regexec_flags;
        regmatch_t pmatch[10];
-       regex_t sreg, *re;
+       regex_t sreg, *regex;
+
+       resbuf = NULL;
+       residx = 0;
+       match_no = 0;
+       regexec_flags = 0;
+       regex = as_regex(rn, &sreg);
+       sp = getvar_s(src ? src : intvar[F0]);
+       replen = strlen(repl);
+       while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) {
+               int so = pmatch[0].rm_so;
+               int eo = pmatch[0].rm_eo;
+
+               //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);
+               resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);
+               memcpy(resbuf + residx, sp, eo);
+               residx += eo;
+               if (++match_no >= nm) {
+                       const char *s;
+                       int nbs;
 
-       re = as_regex(rn, &sreg);
-       if (!src)
-               src = intvar[F0];
-       if (!dest)
-               dest = intvar[F0];
-
-       i = di = 0;
-       sp = getvar_s(src);
-       rl = strlen(repl);
-       while (regexec(re, sp, 10, pmatch, sp==getvar_s(src) ? 0 : REG_NOTBOL) == 0) {
-               so = pmatch[0].rm_so;
-               eo = pmatch[0].rm_eo;
-
-               ds = qrealloc(ds, di + eo + rl, &dssize);
-               memcpy(ds + di, sp, eo);
-               di += eo;
-               if (++i >= nm) {
                        /* replace */
-                       di -= (eo - so);
+                       residx -= (eo - so);
                        nbs = 0;
                        for (s = repl; *s; s++) {
-                               ds[di++] = c = *s;
+                               char c = resbuf[residx++] = *s;
                                if (c == '\\') {
                                        nbs++;
                                        continue;
                                }
-                               if (c == '&' || (ex && c >= '0' && c <= '9')) {
-                                       di -= ((nbs + 3) >> 1);
+                               if (c == '&' || (subexp && c >= '0' && c <= '9')) {
+                                       int j;
+                                       residx -= ((nbs + 3) >> 1);
                                        j = 0;
                                        if (c != '&') {
                                                j = c - '0';
                                                nbs++;
                                        }
                                        if (nbs % 2) {
-                                               ds[di++] = c;
+                                               resbuf[residx++] = c;
                                        } else {
-                                               n = pmatch[j].rm_eo - pmatch[j].rm_so;
-                                               ds = qrealloc(ds, di + rl + n, &dssize);
-                                               memcpy(ds + di, sp + pmatch[j].rm_so, n);
-                                               di += n;
+                                               int n = pmatch[j].rm_eo - pmatch[j].rm_so;
+                                               resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
+                                               memcpy(resbuf + residx, sp + pmatch[j].rm_so, n);
+                                               residx += n;
                                        }
                                }
                                nbs = 0;
                        }
                }
 
+               regexec_flags = REG_NOTBOL;
                sp += eo;
-               if (i == nm)
+               if (match_no == nm)
                        break;
                if (eo == so) {
-                       ds[di] = *sp++;
-                       if (!ds[di++])
-                               break;
+                       /* Empty match (e.g. "b*" will match anywhere).
+                        * Advance by one char. */
+//BUG (bug 1333):
+//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc"
+//... and will erroneously match "b" even though it is NOT at the word start.
+//we need REG_NOTBOW but it does not exist...
+                       /* Subtle: this is safe only because
+                        * qrealloc allocated at least one extra byte */
+                       resbuf[residx] = *sp;
+                       if (*sp == '\0')
+                               goto ret;
+                       sp++;
+                       residx++;
                }
        }
 
-       ds = qrealloc(ds, di + strlen(sp), &dssize);
-       strcpy(ds + di, sp);
-       setvar_p(dest, ds);
-       if (re == &sreg)
-               regfree(re);
-       return i;
+       resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);
+       strcpy(resbuf + residx, sp);
+ ret:
+       //bb_error_msg("end sp:'%s'%p", sp,sp);
+       setvar_p(dest ? dest : intvar[F0], resbuf);
+       if (regex == &sreg)
+               regfree(regex);
+       return match_no;
 }
 
 static NOINLINE int do_mktime(const char *ds)