ash: fix quadratic matching slowdown is ${v/*foo*/repl} (really bad one)
authorDenys Vlasenko <vda.linux@googlemail.com>
Sat, 13 Mar 2010 15:19:04 +0000 (16:19 +0100)
committerDenys Vlasenko <vda.linux@googlemail.com>
Sat, 13 Mar 2010 15:19:04 +0000 (16:19 +0100)
It is especially bad with patterns starting with "*".

With ASH_OPTIMIZE_FOR_SIZE=y, only those are optimized, +few bytes:
   text    data     bss     dec     hex filename
 836337     441    7564  844342   ce236 busybox_old
 836341     441    7564  844346   ce23a busybox_unstripped

With ASH_OPTIMIZE_FOR_SIZE off, we also optimize patterns _ending_ with "*",
which costs about 80 bytes:
   text    data     bss     dec     hex filename
 836656     441    7564  844661   ce375 busybox_old
 836732     441    7564  844737   ce3c1 busybox_unstripped

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
shell/ash.c

index 0a8b6c0f2e9d3bf34321caebfd5aa2e7de308496..ce82a965c226b393bcd4f20fb10f37398206c3eb 100644 (file)
@@ -6040,25 +6040,61 @@ scanleft(char *startp, char *rmesc, char *rmescend UNUSED_PARAM, char *str, int
 }
 
 static char *
-scanright(char *startp, char *rmesc, char *rmescend, char *str, int quotes,
-       int zero)
+scanright(char *startp, char *rmesc, char *rmescend, char *pattern, int quotes, int match_at_start)
 {
+#if !ENABLE_ASH_OPTIMIZE_FOR_SIZE
+       int try2optimize = match_at_start;
+#endif
        int esc = 0;
        char *loc;
        char *loc2;
 
-       for (loc = str - 1, loc2 = rmescend; loc >= startp; loc2--) {
+       /* If we called by "${v/pattern/repl}" or "${v//pattern/repl}":
+        * startp="escaped_value_of_v" rmesc="raw_value_of_v"
+        * rmescend=""(ptr to NUL in rmesc) pattern="pattern" quotes=match_at_start=1
+        * Logic:
+        * loc starts at NUL at the end of startp, loc2 starts at the end of rmesc,
+        * and on each iteration they go back two/one char until they reach the beginning.
+        * We try to find a match in "raw_value_of_v", "raw_value_of_", "raw_value_of" etc.
+        */
+       /* TODO: document in what other circumstances we are called. */
+
+       for (loc = pattern - 1, loc2 = rmescend; loc >= startp; loc2--) {
                int match;
                char c = *loc2;
                const char *s = loc2;
-               if (zero) {
+               if (match_at_start) {
                        *loc2 = '\0';
                        s = rmesc;
                }
-               match = pmatch(str, s);
+               match = pmatch(pattern, s);
+               //bb_error_msg("pmatch(pattern:'%s',s:'%s'):%d", pattern, s, match);
                *loc2 = c;
                if (match)
                        return loc;
+#if !ENABLE_ASH_OPTIMIZE_FOR_SIZE
+               if (try2optimize) {
+                       /* Maybe we can optimize this:
+                        * if pattern ends with unescaped *, we can avoid checking
+                        * shorter strings: if "foo*" doesnt match "raw_value_of_v",
+                        * it wont match truncated "raw_value_of_" strings too.
+                        */
+                       unsigned plen = strlen(pattern);
+                       /* Does it end with "*"? */
+                       if (plen != 0 && pattern[--plen] == '*') {
+                               /* "xxxx*" is not escaped */
+                               /* "xxx\*" is escaped */
+                               /* "xx\\*" is not escaped */
+                               /* "x\\\*" is escaped */
+                               int slashes = 0;
+                               while (plen != 0 && pattern[--plen] == '\\')
+                                       slashes++;
+                               if (!(slashes & 1))
+                                       break; /* ends with unescaped "*" */
+                       }
+                       try2optimize = 0;
+               }
+#endif
                loc--;
                if (quotes) {
                        if (--esc < 0) {
@@ -6248,7 +6284,7 @@ subevalvar(char *p, char *str, int strloc, int subtype,
 
 #if ENABLE_ASH_BASH_COMPAT
        if (subtype == VSREPLACE || subtype == VSREPLACEALL) {
-               char *idx, *end, *restart_detect;
+               char *idx, *end;
 
                if (!repl) {
                        repl = parse_sub_pattern(str, varflags & VSQUOTE);
@@ -6257,17 +6293,19 @@ subevalvar(char *p, char *str, int strloc, int subtype,
                }
 
                /* If there's no pattern to match, return the expansion unmolested */
-               if (*str == '\0')
+               if (str[0] == '\0')
                        return 0;
 
                len = 0;
                idx = startp;
                end = str - 1;
                while (idx < end) {
+ try_to_match:
                        loc = scanright(idx, rmesc, rmescend, str, quotes, 1);
                        if (!loc) {
                                /* No match, advance */
-                               restart_detect = stackblock();
+                               char *restart_detect = stackblock();
+ skip_matching:
                                STPUTC(*idx, expdest);
                                if (quotes && (unsigned char)*idx == CTLESC) {
                                        idx++;
@@ -6279,7 +6317,16 @@ subevalvar(char *p, char *str, int strloc, int subtype,
                                idx++;
                                len++;
                                rmesc++;
-                               continue;
+                               /* continue; - prone to quadratic behavior, smarter code: */
+                               if (idx >= end)
+                                       break;
+                               if (str[0] == '*') {
+                                       /* Pattern is "*foo". If "*foo" does not match "long_string",
+                                        * it would never match "ong_string" etc, no point in trying.
+                                        */
+                                       goto skip_matching;
+                               }
+                               goto try_to_match;
                        }
 
                        if (subtype == VSREPLACEALL) {
@@ -6294,7 +6341,7 @@ subevalvar(char *p, char *str, int strloc, int subtype,
                        }
 
                        for (loc = repl; *loc; loc++) {
-                               restart_detect = stackblock();
+                               char *restart_detect = stackblock();
                                STPUTC(*loc, expdest);
                                if (stackblock() != restart_detect)
                                        goto restart;
@@ -6303,7 +6350,7 @@ subevalvar(char *p, char *str, int strloc, int subtype,
 
                        if (subtype == VSREPLACE) {
                                while (*idx) {
-                                       restart_detect = stackblock();
+                                       char *restart_detect = stackblock();
                                        STPUTC(*idx, expdest);
                                        if (stackblock() != restart_detect)
                                                goto restart;
@@ -6332,7 +6379,7 @@ subevalvar(char *p, char *str, int strloc, int subtype,
        if (subtype < 0 || subtype > 7)
                abort();
 #endif
-       /* zero = subtype == VSTRIMLEFT || subtype == VSTRIMLEFTMAX */
+       /* zero = (subtype == VSTRIMLEFT || subtype == VSTRIMLEFTMAX) */
        zero = subtype >> 1;
        /* VSTRIMLEFT/VSTRIMRIGHTMAX -> scanleft */
        scan = (subtype & 1) ^ zero ? scanleft : scanright;