From ad47d45e9da8df364cb0a61b6146d51c196c8891 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Tue, 20 Mar 2012 19:44:05 -0400 Subject: [PATCH] upgrade to latest upstream TRE regex code (0.8.0) the main practical results of this change are 1. the regex code is no longer subject to LGPL; it's now 2-clause BSD 2. most (all?) popular nonstandard regex extensions are supported I hesitate to call this a "sync" since both the old and new code are heavily modified. in one sense, the old code was "more severely" modified, in that it was actively hostile to non-strictly-conforming expressions. on the other hand, the new code has eliminated the useless translation of the entire regex string to wchar_t prior to compiling, and now only converts multibyte character literals as needed. in the future i may use this modified TRE as a basis for writing the long-planned new regex engine that will avoid multibyte-to-wide character conversion entirely by compiling multibyte bracket expressions specific to UTF-8. --- src/regex/regcomp.c | 1597 ++++++++++++++++++++++-------------------- src/regex/regerror.c | 56 +- src/regex/regexec.c | 386 ++++------ src/regex/tre-mem.c | 67 +- src/regex/tre.h | 99 +-- 5 files changed, 1037 insertions(+), 1168 deletions(-) diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index 875f56fd..8987f5aa 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -1,21 +1,31 @@ /* regcomp.c - TRE POSIX compatible regex compilation functions. - Copyright (c) 2001-2006 Ville Laurikari - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (c) 2001-2009 Ville Laurikari + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -30,6 +40,23 @@ #include +/*********************************************************************** + from tre-compile.h +***********************************************************************/ + +typedef struct { + int position; + int code_min; + int code_max; + int *tags; + int assertions; + tre_ctype_t class; + tre_ctype_t *neg_classes; + int backref; + int *params; +} tre_pos_and_tags_t; + + /*********************************************************************** from tre-ast.c and tre-ast.h ***********************************************************************/ @@ -54,17 +81,6 @@ typedef enum { #define IS_TAG(x) ((x)->code_min == TAG) #define IS_BACKREF(x) ((x)->code_min == BACKREF) -/* Taken from tre-compile.h */ -typedef struct { - int position; - int code_min; - int code_max; - int *tags; - int assertions; - tre_ctype_t class; - tre_ctype_t *neg_classes; - int backref; -} tre_pos_and_tags_t; /* A generic AST node. All AST nodes consist of this node on the top level with `obj' pointing to the actual content. */ @@ -87,7 +103,10 @@ typedef struct { long code_min; long code_max; int position; - tre_ctype_t class; + union { + tre_ctype_t class; + int *params; + } u; tre_ctype_t *neg_classes; } tre_literal_t; @@ -109,6 +128,10 @@ typedef struct { int min; /* Maximum number of consecutive matches. */ int max; + /* If 0, match as many characters as possible, if 1 match as few as + possible. Note that this does not always mean the same thing as + matching as many/few repetitions as possible. */ + unsigned int minimal:1; } tre_iteration_t; /* An "union" node. These are created for the "|" operator. */ @@ -117,6 +140,24 @@ typedef struct { tre_ast_node_t *right; } tre_union_t; +static tre_ast_node_t * +tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size); + +static tre_ast_node_t * +tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position); + +static tre_ast_node_t * +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, + int minimal); + +static tre_ast_node_t * +tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right); + +static tre_ast_node_t * +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, + tre_ast_node_t *right); + + static tre_ast_node_t * tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) { @@ -153,7 +194,8 @@ tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) } static tre_ast_node_t * -tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max) +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, + int minimal) { tre_ast_node_t *node; tre_iteration_t *iter; @@ -165,6 +207,7 @@ tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max) iter->arg = arg; iter->min = min; iter->max = max; + iter->minimal = minimal; node->num_submatches = arg->num_submatches; return node; @@ -201,40 +244,82 @@ tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, return node; } + /*********************************************************************** from tre-stack.c and tre-stack.h ***********************************************************************/ +typedef struct tre_stack_rec tre_stack_t; + +/* Creates a new stack object. `size' is initial size in bytes, `max_size' + is maximum size, and `increment' specifies how much more space will be + allocated with realloc() if all space gets used up. Returns the stack + object or NULL if out of memory. */ +static tre_stack_t * +tre_stack_new(int size, int max_size, int increment); + +/* Frees the stack object. */ +static void +tre_stack_destroy(tre_stack_t *s); + +/* Returns the current number of objects in the stack. */ +static int +tre_stack_num_objects(tre_stack_t *s); + +/* Each tre_stack_push_*(tre_stack_t *s, value) function pushes + `value' on top of stack `s'. Returns REG_ESPACE if out of memory. + This tries to realloc() more space before failing if maximum size + has not yet been reached. Returns REG_OK if successful. */ +#define declare_pushf(typetag, type) \ + static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value) + +declare_pushf(voidptr, void *); +declare_pushf(int, int); + +/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost + element off of stack `s' and returns it. The stack must not be + empty. */ +#define declare_popf(typetag, type) \ + static type tre_stack_pop_ ## typetag(tre_stack_t *s) + +declare_popf(voidptr, void *); +declare_popf(int, int); + /* Just to save some typing. */ -#define STACK_PUSH(s, value) \ +#define STACK_PUSH(s, typetag, value) \ do \ { \ - status = tre_stack_push(s, (void *)(value)); \ + status = tre_stack_push_ ## typetag(s, value); \ } \ - while (0) + while (/*CONSTCOND*/0) -#define STACK_PUSHX(s, value) \ +#define STACK_PUSHX(s, typetag, value) \ { \ - status = tre_stack_push(s, (void *)(value)); \ + status = tre_stack_push_ ## typetag(s, value); \ if (status != REG_OK) \ break; \ } -#define STACK_PUSHR(s, value) \ +#define STACK_PUSHR(s, typetag, value) \ { \ - reg_errcode_t status; \ - status = tre_stack_push(s, (void *)(value)); \ - if (status != REG_OK) \ - return status; \ + reg_errcode_t _status; \ + _status = tre_stack_push_ ## typetag(s, value); \ + if (_status != REG_OK) \ + return _status; \ } -typedef struct tre_stack_rec { +union tre_stack_item { + void *voidptr_value; + int int_value; +}; + +struct tre_stack_rec { int size; int max_size; int increment; int ptr; - void **stack; -} tre_stack_t; + union tre_stack_item *stack; +}; static tre_stack_t * @@ -273,7 +358,7 @@ tre_stack_num_objects(tre_stack_t *s) } static reg_errcode_t -tre_stack_push(tre_stack_t *s, void *value) +tre_stack_push(tre_stack_t *s, union tre_stack_item value) { if (s->ptr < s->size) { @@ -284,24 +369,20 @@ tre_stack_push(tre_stack_t *s, void *value) { if (s->size >= s->max_size) { - DPRINT(("tre_stack_push: stack full\n")); return REG_ESPACE; } else { - void **new_buffer; + union tre_stack_item *new_buffer; int new_size; - DPRINT(("tre_stack_push: trying to realloc more space\n")); new_size = s->size + s->increment; if (new_size > s->max_size) new_size = s->max_size; new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size); if (new_buffer == NULL) { - DPRINT(("tre_stack_push: realloc failed.\n")); return REG_ESPACE; } - DPRINT(("tre_stack_push: realloc succeeded.\n")); assert(new_size > s->size); s->size = new_size; s->stack = new_buffer; @@ -311,12 +392,24 @@ tre_stack_push(tre_stack_t *s, void *value) return REG_OK; } -static void * -tre_stack_pop(tre_stack_t *s) -{ - return s->stack[--s->ptr]; +#define define_pushf(typetag, type) \ + declare_pushf(typetag, type) { \ + union tre_stack_item item; \ + item.typetag ## _value = value; \ + return tre_stack_push(s, item); \ } +define_pushf(int, int) +define_pushf(voidptr, void *) + +#define define_popf(typetag, type) \ + declare_popf(typetag, type) { \ + return s->stack[--s->ptr].typetag ## _value; \ + } + +define_popf(int, int) +define_popf(voidptr, void *) + /*********************************************************************** from tre-parse.c and tre-parse.h @@ -331,24 +424,86 @@ typedef struct { /* The parse result. */ tre_ast_node_t *result; /* The regexp to parse and its length. */ - const tre_char_t *re; + const char *re; /* The first character of the entire regexp. */ - const tre_char_t *re_start; - /* The first character after the end of the regexp. */ - const tre_char_t *re_end; - int len; + const char *re_start; /* Current submatch ID. */ int submatch_id; /* Current position (number of literal). */ int position; /* The highest back reference or -1 if none seen so far. */ int max_backref; + /* This flag is set if the regexp uses approximate matching. */ + int have_approx; /* Compilation flags. */ int cflags; /* If this flag is set the top-level submatch is not captured. */ int nofirstsub; } tre_parse_ctx_t; +/* Parses a wide character regexp pattern into a syntax tree. This parser + handles both syntaxes (BRE and ERE), including the TRE extensions. */ +static reg_errcode_t +tre_parse(tre_parse_ctx_t *ctx); + + +/* + This parser is just a simple recursive descent parser for POSIX.2 + regexps. The parser supports both the obsolete default syntax and + the "extended" syntax, and some nonstandard extensions. +*/ + +/* Characters with special meanings in regexp syntax. */ +#define CHAR_PIPE '|' +#define CHAR_LPAREN '(' +#define CHAR_RPAREN ')' +#define CHAR_LBRACE '{' +#define CHAR_RBRACE '}' +#define CHAR_LBRACKET '[' +#define CHAR_RBRACKET ']' +#define CHAR_MINUS '-' +#define CHAR_STAR '*' +#define CHAR_QUESTIONMARK '?' +#define CHAR_PLUS '+' +#define CHAR_PERIOD '.' +#define CHAR_COLON ':' +#define CHAR_EQUAL '=' +#define CHAR_COMMA ',' +#define CHAR_CARET '^' +#define CHAR_DOLLAR '$' +#define CHAR_BACKSLASH '\\' +#define CHAR_HASH '#' +#define CHAR_TILDE '~' + + +/* Some macros for expanding \w, \s, etc. */ +static const struct tre_macro_struct { + const char c; + const char *expansion; +} tre_macros[] = + { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, + {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, + {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, + {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, + { 0, NULL } + }; + + +/* Expands a macro delimited by `regex' and `regex_end' to `buf', which + must have at least `len' items. Sets buf[0] to zero if the there + is no match in `tre_macros'. */ +static const char * +tre_expand_macro(const char *regex) +{ + int i; + + if (!*regex) + return 0; + + for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++); + return tre_macros[i].expansion; +} + static reg_errcode_t tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, tre_ast_node_t ***items) @@ -359,7 +514,6 @@ tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, if (*i >= *max_i) { tre_ast_node_t **new_items; - DPRINT(("out of array space, i = %d\n", *i)); /* If the array is already 1024 items large, give up -- there's probably an error in the regexp (e.g. not a '\0' terminated string and missing ']') */ @@ -378,47 +532,11 @@ tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, } -/* Expands a character class to character ranges. */ -static reg_errcode_t -tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, - int *i, int *max_i, int cflags) -{ - reg_errcode_t status = REG_OK; - tre_cint_t c; - int j, min = -1, max = 0; - assert(TRE_MB_CUR_MAX == 1); - - DPRINT((" expanding class to character ranges\n")); - for (j = 0; (j < 256) && (status == REG_OK); j++) - { - c = j; - if (tre_isctype(c, class) - || ((cflags & REG_ICASE) - && (tre_isctype(tre_tolower(c), class) - || tre_isctype(tre_toupper(c), class)))) -{ - if (min < 0) - min = c; - max = c; - } - else if (min >= 0) - { - DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max)); - status = tre_new_item(mem, min, max, i, max_i, items); - min = -1; - } - } - if (min >= 0 && status == REG_OK) - status = tre_new_item(mem, min, max, i, max_i, items); - return status; -} - - static int tre_compare_items(const void *a, const void *b) { - tre_ast_node_t *node_a = *(tre_ast_node_t **)a; - tre_ast_node_t *node_b = *(tre_ast_node_t **)b; + const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; + const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; int a_min = l_a->code_min, b_min = l_b->code_min; @@ -443,7 +561,7 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, tre_ast_node_t ***items, int *num_items, int *items_size) { - const tre_char_t *re = ctx->re; + const char *re = ctx->re; reg_errcode_t status = REG_OK; tre_ctype_t class = (tre_ctype_t)0; int i = *num_items; @@ -454,86 +572,56 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, while (status == REG_OK) { skip = 0; - if (re == ctx->re_end) + if (!*re) { status = REG_EBRACK; } - else if (*re == ']' && re > ctx->re) + else if (*re == CHAR_RBRACKET && re > ctx->re) { - DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", - ctx->re_end - re, re)); re++; break; } else { tre_cint_t min = 0, max = 0; + wchar_t wc; + int clen = mbtowc(&wc, re, -1); + + if (clen<0) clen=1, wc=WEOF; class = (tre_ctype_t)0; - if (re + 2 < ctx->re_end - && *(re + 1) == '-' && *(re + 2) != ']') + if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET) { - DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", - ctx->re_end - re, re)); - min = *re; - max = *(re + 2); - re += 3; + min = wc; + re += clen+1; + clen = mbtowc(&wc, re, -1); + if (clen<0) clen=1, wc=WEOF; + max = wc; + re += clen; /* XXX - Should use collation order instead of encoding values in character ranges. */ if (min > max) status = REG_ERANGE; } - else if (re + 1 < ctx->re_end - && *re == '[' && *(re + 1) == '.') + else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) status = REG_ECOLLATE; - else if (re + 1 < ctx->re_end - && *re == '[' && *(re + 1) == '=') + else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) status = REG_ECOLLATE; - else if (re + 1 < ctx->re_end - && *re == '[' && *(re + 1) == ':') + else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) { char tmp_str[64]; - const tre_char_t *endptr = re + 2; + const char *endptr = re + 2; int len; - DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", - ctx->re_end - re, re)); - while (endptr < ctx->re_end && *endptr != ':') + while (*endptr && *endptr != CHAR_COLON) endptr++; - if (endptr != ctx->re_end) + if (*endptr) { len = MIN(endptr - re - 2, 63); -#ifdef TRE_WCHAR - { - tre_char_t tmp_wcs[64]; - wcsncpy(tmp_wcs, re + 2, len); - tmp_wcs[len] = '\0'; -#if defined HAVE_WCSRTOMBS - { - mbstate_t state; - const tre_char_t *src = tmp_wcs; - memset(&state, '\0', sizeof(state)); - len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); - } -#elif defined HAVE_WCSTOMBS - len = wcstombs(tmp_str, tmp_wcs, 63); -#endif /* defined HAVE_WCSTOMBS */ - } -#else /* !TRE_WCHAR */ strncpy(tmp_str, re + 2, len); -#endif /* !TRE_WCHAR */ tmp_str[len] = '\0'; - DPRINT((" class name: %s\n", tmp_str)); class = tre_ctype(tmp_str); if (!class) status = REG_ECTYPE; - /* Optimize character classes for 8 bit character sets. */ - if (status == REG_OK && TRE_MB_CUR_MAX == 1) - { - status = tre_expand_ctype(ctx->mem, class, items, - &i, &max_i, ctx->cflags); - class = (tre_ctype_t)0; - skip = 1; - } re = endptr + 2; } else @@ -543,13 +631,12 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, } else { - DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", - ctx->re_end - re, re)); - if (*re == '-' && *(re + 1) != ']' + if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET && ctx->re != re) /* Two ranges are not allowed to share and endpoint. */ status = REG_ERANGE; - min = max = *re++; + min = max = wc; + re += clen; } if (status != REG_OK) @@ -565,16 +652,15 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); if (status != REG_OK) break; - ((tre_literal_t*)((*items)[i-1])->obj)->class = class; + ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class; } /* Add opposite-case counterpoints if REG_ICASE is present. This is broken if there are more than two "same" characters. */ if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) { - int cmin, ccurr; + tre_cint_t cmin, ccurr; - DPRINT(("adding opposite-case counterpoints\n")); while (min <= max) { if (tre_islower(min)) @@ -626,10 +712,8 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) if (items == NULL) return REG_ESPACE; - if (*ctx->re == '^') + if (*ctx->re == CHAR_CARET) { - DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); negate = 1; ctx->re++; } @@ -642,7 +726,7 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) /* Sort the array if we need to negate it. */ if (negate) - qsort(items, i, sizeof(*items), tre_compare_items); + qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); curr_max = curr_min = 0; /* Build a union of the items in the array, negated if necessary. */ @@ -653,16 +737,12 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) min = l->code_min; max = l->code_max; - DPRINT(("item: %d - %d, class %ld, curr_max = %d\n", - (int)l->code_min, (int)l->code_max, (long)l->class, curr_max)); - if (negate) { if (min < curr_max) { /* Overlap. */ curr_max = MAX(max + 1, curr_max); - DPRINT(("overlap, curr_max = %d\n", curr_max)); l = NULL; } else @@ -671,13 +751,11 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) curr_max = min - 1; if (curr_max >= curr_min) { - DPRINT(("no overlap\n")); l->code_min = curr_min; l->code_max = curr_max; } else { - DPRINT(("no overlap, zero room\n")); l = NULL; } curr_min = curr_max = max + 1; @@ -687,7 +765,6 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) if (l != NULL) { int k; - DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); l->position = ctx->position; if (num_neg_classes > 0) { @@ -723,7 +800,6 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) if (negate) { int k; - DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); if (n == NULL) status = REG_ESPACE; @@ -776,11 +852,11 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) /* Parses a positive decimal integer. Returns -1 if the string does not contain a valid number. */ static int -tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) +tre_parse_int(const char **regex) { int num = -1; - const tre_char_t *r = *regex; - while (r < regex_end && *r >= '0' && *r <= '9') + const char *r = *regex; + while (*r-'0'<10U) { if (num < 0) num = 0; @@ -796,67 +872,29 @@ static reg_errcode_t tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) { int min, max; - const tre_char_t *r = ctx->re; - const tre_char_t *start; - int counts_set = 0; + const char *r = ctx->re; + int minimal = 0; /* Parse number (minimum repetition count). */ min = -1; - if (r < ctx->re_end && *r >= '0' && *r <= '9') { - DPRINT(("tre_parse: min count: '%.*" STRF "'\n", ctx->re_end - r, r)); - min = tre_parse_int(&r, ctx->re_end); + if (*r >= '0' && *r <= '9') { + min = tre_parse_int(&r); } /* Parse comma and second number (maximum repetition count). */ max = min; - if (r < ctx->re_end && *r == ',') + if (*r == CHAR_COMMA) { r++; - DPRINT(("tre_parse: max count: '%.*" STRF "'\n", ctx->re_end - r, r)); - max = tre_parse_int(&r, ctx->re_end); + max = tre_parse_int(&r); } /* Check that the repeat counts are sane. */ if ((max >= 0 && min > max) || max > RE_DUP_MAX) return REG_BADBR; - - /* - '{' - optionally followed immediately by a number == minimum repcount - optionally followed by , then a number == maximum repcount - */ - - - do { - int done; - start = r; - - /* Parse count limit settings */ - done = 0; - if (!counts_set) - while (r + 1 < ctx->re_end && !done) - { - switch (*r) - { - case ',': - r++; - break; - case ' ': - r++; - break; - case '}': - done = 1; - break; - default: - done = 1; - break; - } - } - } while (start != r); - /* Missing }. */ - if (r >= ctx->re_end) + if (!*r) return REG_EBRACE; /* Empty contents of {}. */ @@ -866,20 +904,17 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) /* Parse the ending '}' or '\}'.*/ if (ctx->cflags & REG_EXTENDED) { - if (r >= ctx->re_end || *r != '}') + if (*r != CHAR_RBRACE) return REG_BADBR; r++; } else { - if (r + 1 >= ctx->re_end - || *r != '\\' - || *(r + 1) != '}') + if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE) return REG_BADBR; r += 2; } - /* Create the AST node(s). */ if (min == 0 && max == 0) { @@ -893,7 +928,7 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) /* Only approximate parameters set, no repetitions. */ min = max = 1; - *result = tre_ast_new_iter(ctx->mem, *result, min, max); + *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); if (!*result) return REG_ESPACE; } @@ -927,19 +962,14 @@ tre_parse(tre_parse_ctx_t *ctx) int bottom = tre_stack_num_objects(stack); int depth = 0; - DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n", - ctx->len, ctx->re, ctx->len)); - if (!ctx->nofirstsub) { - STACK_PUSH(stack, ctx->re); - STACK_PUSH(stack, ctx->submatch_id); - STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH); + STACK_PUSH(stack, int, ctx->submatch_id); + STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); ctx->submatch_id++; } - STACK_PUSH(stack, PARSE_RE); + STACK_PUSH(stack, int, PARSE_RE); ctx->re_start = ctx->re; - ctx->re_end = ctx->re + ctx->len; /* The following is basically just a recursive descent parser. I use @@ -950,67 +980,67 @@ tre_parse(tre_parse_ctx_t *ctx) { if (status != REG_OK) break; - symbol = (tre_parse_re_stack_symbol_t)tre_stack_pop(stack); + symbol = tre_stack_pop_int(stack); switch (symbol) { case PARSE_RE: /* Parse a full regexp. A regexp is one or more branches, separated by the union operator `|'. */ if (ctx->cflags & REG_EXTENDED) - STACK_PUSHX(stack, PARSE_UNION); - STACK_PUSHX(stack, PARSE_BRANCH); + STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, int, PARSE_BRANCH); break; case PARSE_BRANCH: /* Parse a branch. A branch is one or more pieces, concatenated. A piece is an atom possibly followed by a postfix operator. */ - STACK_PUSHX(stack, PARSE_CATENATION); - STACK_PUSHX(stack, PARSE_PIECE); + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, int, PARSE_PIECE); break; case PARSE_PIECE: /* Parse a piece. A piece is an atom possibly followed by one or more postfix operators. */ - STACK_PUSHX(stack, PARSE_POSTFIX); - STACK_PUSHX(stack, PARSE_ATOM); + STACK_PUSHX(stack, int, PARSE_POSTFIX); + STACK_PUSHX(stack, int, PARSE_ATOM); break; case PARSE_CATENATION: /* If the expression has not ended, parse another piece. */ { tre_char_t c; - if (ctx->re >= ctx->re_end) + if (!*ctx->re) break; c = *ctx->re; - if (ctx->cflags & REG_EXTENDED && c == '|') - break; - if ((ctx->cflags & REG_EXTENDED - && c == ')' && depth > 0) - || (!(ctx->cflags & REG_EXTENDED) - && (c == '\\' - && *(ctx->re + 1) == ')'))) + if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) + break; + if ((ctx->cflags & REG_EXTENDED + && c == CHAR_RPAREN && depth > 0) + || (!(ctx->cflags & REG_EXTENDED) + && (c == CHAR_BACKSLASH + && *(ctx->re + 1) == CHAR_RPAREN))) + { + if (!(ctx->cflags & REG_EXTENDED) && depth == 0) + status = REG_EPAREN; + depth--; + if (!(ctx->cflags & REG_EXTENDED)) + ctx->re += 2; + break; + } + { - if (!(ctx->cflags & REG_EXTENDED) && depth == 0) - status = REG_EPAREN; - DPRINT(("tre_parse: group end: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - depth--; - if (!(ctx->cflags & REG_EXTENDED)) - ctx->re += 2; - break; + /* Default case, left associative concatenation. */ + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, voidptr, result); + STACK_PUSHX(stack, int, PARSE_POST_CATENATION); + STACK_PUSHX(stack, int, PARSE_PIECE); } - - /* Left associative concatenation. */ - STACK_PUSHX(stack, PARSE_CATENATION); - STACK_PUSHX(stack, result); - STACK_PUSHX(stack, PARSE_POST_CATENATION); - STACK_PUSHX(stack, PARSE_PIECE); break; } case PARSE_POST_CATENATION: { - tre_ast_node_t *tree = tre_stack_pop(stack); + tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); tre_ast_node_t *tmp_node; tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); if (!tmp_node) @@ -1020,21 +1050,19 @@ tre_parse(tre_parse_ctx_t *ctx) } case PARSE_UNION: - if (ctx->re >= ctx->re_end) + if (!*ctx->re) break; switch (*ctx->re) { - case '|': - DPRINT(("tre_parse: union: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - STACK_PUSHX(stack, PARSE_UNION); - STACK_PUSHX(stack, result); - STACK_PUSHX(stack, PARSE_POST_UNION); - STACK_PUSHX(stack, PARSE_BRANCH); + case CHAR_PIPE: + STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, voidptr, result); + STACK_PUSHX(stack, int, PARSE_POST_UNION); + STACK_PUSHX(stack, int, PARSE_BRANCH); ctx->re++; break; - case ')': + case CHAR_RPAREN: ctx->re++; break; @@ -1046,7 +1074,7 @@ tre_parse(tre_parse_ctx_t *ctx) case PARSE_POST_UNION: { tre_ast_node_t *tmp_node; - tre_ast_node_t *tree = tre_stack_pop(stack); + tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); tmp_node = tre_ast_new_union(ctx->mem, tree, result); if (!tmp_node) return REG_ESPACE; @@ -1056,38 +1084,55 @@ tre_parse(tre_parse_ctx_t *ctx) case PARSE_POSTFIX: /* Parse postfix operators. */ - if (ctx->re >= ctx->re_end) + if (!*ctx->re) break; switch (*ctx->re) { - case '+': - case '?': + case CHAR_PLUS: + case CHAR_QUESTIONMARK: if (!(ctx->cflags & REG_EXTENDED)) - break; - case '*': + break; + /*FALLTHROUGH*/ + case CHAR_STAR: { tre_ast_node_t *tmp_node; + int minimal = 0; int rep_min = 0; int rep_max = -1; - if (*ctx->re == '+') + + if (*ctx->re == CHAR_PLUS) rep_min = 1; - if (*ctx->re == '?') + if (*ctx->re == CHAR_QUESTIONMARK) rep_max = 1; + { + if (*(ctx->re + 1) == CHAR_QUESTIONMARK) + { + minimal = 1; + ctx->re++; + } + else if (*(ctx->re + 1) == CHAR_STAR + || *(ctx->re + 1) == CHAR_PLUS) + { + /* These are reserved for future extensions. */ + return REG_BADRPT; + } + } + ctx->re++; - tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max); + tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, + minimal); if (tmp_node == NULL) return REG_ESPACE; result = tmp_node; - STACK_PUSHX(stack, PARSE_POSTFIX); - break; + STACK_PUSHX(stack, int, PARSE_POSTFIX); } + break; - case '\\': + case CHAR_BACKSLASH: /* "\{" is special without REG_EXTENDED */ if (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && *(ctx->re + 1) == '{') + && *(ctx->re + 1) == CHAR_LBRACE) { ctx->re++; goto parse_brace; @@ -1095,20 +1140,18 @@ tre_parse(tre_parse_ctx_t *ctx) else break; - case '{': + case CHAR_LBRACE: /* "{" is literal without REG_EXTENDED */ if (!(ctx->cflags & REG_EXTENDED)) break; parse_brace: - DPRINT(("tre_parse: bound: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); ctx->re++; status = tre_parse_bound(ctx, &result); if (status != REG_OK) return status; - STACK_PUSHX(stack, PARSE_POSTFIX); + STACK_PUSHX(stack, int, PARSE_POSTFIX); break; } break; @@ -1119,29 +1162,25 @@ tre_parse(tre_parse_ctx_t *ctx) a `\' followed by a character, or a single character. */ /* End of regexp? (empty string). */ - if (ctx->re >= ctx->re_end) + if (!*ctx->re) goto parse_literal; switch (*ctx->re) { - case '(': /* parenthesized subexpression */ + case CHAR_LPAREN: /* parenthesized subexpression */ if (ctx->cflags & REG_EXTENDED || (ctx->re > ctx->re_start - && *(ctx->re - 1) == '\\')) + && *(ctx->re - 1) == CHAR_BACKSLASH)) { depth++; { - DPRINT(("tre_parse: group begin: '%.*" STRF - "', submatch %d\n", - ctx->re_end - ctx->re, ctx->re, - ctx->submatch_id)); ctx->re++; /* First parse a whole RE, then mark the resulting tree for submatching. */ - STACK_PUSHX(stack, ctx->submatch_id); - STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH); - STACK_PUSHX(stack, PARSE_RE); + STACK_PUSHX(stack, int, ctx->submatch_id); + STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); + STACK_PUSHX(stack, int, PARSE_RE); ctx->submatch_id++; } } @@ -1149,13 +1188,11 @@ tre_parse(tre_parse_ctx_t *ctx) goto parse_literal; break; - case ')': /* end of current subexpression */ + case CHAR_RPAREN: /* end of current subexpression */ if ((ctx->cflags & REG_EXTENDED && depth > 0) || (ctx->re > ctx->re_start - && *(ctx->re - 1) == '\\')) + && *(ctx->re - 1) == CHAR_BACKSLASH)) { - DPRINT(("tre_parse: empty: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); /* We were expecting an atom, but instead the current subexpression was closed. POSIX leaves the meaning of this to be implementation-defined. We interpret this as @@ -1170,44 +1207,130 @@ tre_parse(tre_parse_ctx_t *ctx) goto parse_literal; break; - case '[': /* bracket expression */ - DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); + case CHAR_LBRACKET: /* bracket expression */ ctx->re++; status = tre_parse_bracket(ctx, &result); if (status != REG_OK) return status; break; - case '\\': + case CHAR_BACKSLASH: /* If this is "\(" or "\)" chew off the backslash and try again. */ if (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && (*(ctx->re + 1) == '(' - || *(ctx->re + 1) == ')')) + && (*(ctx->re + 1) == CHAR_LPAREN + || *(ctx->re + 1) == CHAR_RPAREN)) { ctx->re++; - STACK_PUSHX(stack, PARSE_ATOM); + STACK_PUSHX(stack, int, PARSE_ATOM); break; } - if (ctx->re + 1 >= ctx->re_end) + /* If a macro is used, parse the expanded macro recursively. */ + { + const char *buf = tre_expand_macro(ctx->re + 1); + if (buf) + { + tre_parse_ctx_t subctx; + memcpy(&subctx, ctx, sizeof(subctx)); + subctx.re = buf; + subctx.nofirstsub = 1; + status = tre_parse(&subctx); + if (status != REG_OK) + return status; + ctx->re += 2; + ctx->position = subctx.position; + result = subctx.result; + break; + } + } + + if (!*ctx->re) /* Trailing backslash. */ return REG_EESCAPE; - DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); ctx->re++; switch (*ctx->re) { + case 'b': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_WB, -1); + ctx->re++; + break; + case 'B': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_WB_NEG, -1); + ctx->re++; + break; + case '<': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_BOW, -1); + ctx->re++; + break; + case '>': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_EOW, -1); + ctx->re++; + break; + case 'x': + ctx->re++; + if (ctx->re[0] != CHAR_LBRACE) + { + /* 8 bit hex char. */ + char tmp[3] = {0, 0, 0}; + long val; + + if (tre_isxdigit(ctx->re[0])) + { + tmp[0] = (char)ctx->re[0]; + ctx->re++; + } + if (tre_isxdigit(ctx->re[0])) + { + tmp[1] = (char)ctx->re[0]; + ctx->re++; + } + val = strtol(tmp, NULL, 16); + result = tre_ast_new_literal(ctx->mem, (int)val, + (int)val, ctx->position); + ctx->position++; + break; + } + else if (*ctx->re) + { + /* Wide char. */ + char tmp[32]; + long val; + int i = 0; + ctx->re++; + while (*ctx->re && i < sizeof tmp) + { + if (ctx->re[0] == CHAR_RBRACE) + break; + if (tre_isxdigit(ctx->re[0])) + { + tmp[i] = (char)ctx->re[0]; + i++; + ctx->re++; + continue; + } + return REG_EBRACE; + } + ctx->re++; + tmp[i] = 0; + val = strtol(tmp, NULL, 16); + result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, + ctx->position); + ctx->position++; + break; + } + /*FALLTHROUGH*/ + default: - if (!(ctx->cflags & REG_EXTENDED) && tre_isdigit(*ctx->re)) + if (tre_isdigit(*ctx->re)) { /* Back reference. */ int val = *ctx->re - '0'; - DPRINT(("tre_parse: backref: '%.*" STRF "'\n", - ctx->re_end - ctx->re + 1, ctx->re - 1)); result = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position); if (result == NULL) @@ -1219,8 +1342,6 @@ tre_parse(tre_parse_ctx_t *ctx) else { /* Escaped character. */ - DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", - ctx->re_end - ctx->re + 1, ctx->re - 1)); result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, ctx->position); ctx->position++; @@ -1232,9 +1353,7 @@ tre_parse(tre_parse_ctx_t *ctx) return REG_ESPACE; break; - case '.': /* the any-symbol */ - DPRINT(("tre_parse: any: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); + case CHAR_PERIOD: /* the any-symbol */ if (ctx->cflags & REG_NEWLINE) { tre_ast_node_t *tmp1; @@ -1263,17 +1382,15 @@ tre_parse(tre_parse_ctx_t *ctx) ctx->re++; break; - case '^': /* beginning of line assertion */ + case CHAR_CARET: /* beginning of line assertion */ /* '^' has a special meaning everywhere in EREs, and in the beginning of the RE and after \( is BREs. */ if (ctx->cflags & REG_EXTENDED || (ctx->re - 2 >= ctx->re_start - && *(ctx->re - 2) == '\\' - && *(ctx->re - 1) == '(') + && *(ctx->re - 2) == CHAR_BACKSLASH + && *(ctx->re - 1) == CHAR_LPAREN) || ctx->re == ctx->re_start) { - DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); if (result == NULL) @@ -1284,17 +1401,14 @@ tre_parse(tre_parse_ctx_t *ctx) goto parse_literal; break; - case '$': /* end of line assertion. */ + case CHAR_DOLLAR: /* end of line assertion. */ /* '$' is special everywhere in EREs, and in the end of the string and before \) is BREs. */ if (ctx->cflags & REG_EXTENDED - || (ctx->re + 2 < ctx->re_end - && *(ctx->re + 1) == '\\' - && *(ctx->re + 2) == ')') - || ctx->re + 1 == ctx->re_end) + || (*(ctx->re + 1) == CHAR_BACKSLASH + && *(ctx->re + 2) == CHAR_RPAREN) + || !*(ctx->re + 1)) { - DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); result = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); if (result == NULL) @@ -1312,34 +1426,34 @@ tre_parse(tre_parse_ctx_t *ctx) regexp ends here, we interpret it as an empty expression (which matches an empty string). */ if ( - (ctx->re >= ctx->re_end - || *ctx->re == '*' + (!*ctx->re + || *ctx->re == CHAR_STAR || (ctx->cflags & REG_EXTENDED - && (*ctx->re == '|' - || *ctx->re == '{' - || *ctx->re == '+' - || *ctx->re == '?')) + && (*ctx->re == CHAR_PIPE + || *ctx->re == CHAR_LBRACE + || *ctx->re == CHAR_PLUS + || *ctx->re == CHAR_QUESTIONMARK)) /* Test for "\)" in BRE mode. */ || (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && *ctx->re == '\\' - && *(ctx->re + 1) == '{'))) + && !*(ctx->re + 1) + && *ctx->re == CHAR_BACKSLASH + && *(ctx->re + 1) == CHAR_LBRACE))) { - DPRINT(("tre_parse: empty: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); if (!result) return REG_ESPACE; break; } - DPRINT(("tre_parse: literal: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); + wchar_t wc; + int clen = mbtowc(&wc, ctx->re, -1); + if (clen<0) clen=1, wc=WEOF; + /* Note that we can't use an tre_isalpha() test here, since there may be characters which are alphabetic but neither upper or lower case. */ if (ctx->cflags & REG_ICASE - && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) + && (tre_isupper(wc) || tre_islower(wc))) { tre_ast_node_t *tmp1; tre_ast_node_t *tmp2; @@ -1352,13 +1466,13 @@ tre_parse(tre_parse_ctx_t *ctx) that at least for multi-character collating elements there could be several opposite-case counterpoints, but they cannot be supported portably anyway. */ - tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), - tre_toupper(*ctx->re), + tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), + tre_toupper(wc), ctx->position); if (!tmp1) return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), - tre_tolower(*ctx->re), + tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), + tre_tolower(wc), ctx->position); if (!tmp2) return REG_ESPACE; @@ -1368,20 +1482,20 @@ tre_parse(tre_parse_ctx_t *ctx) } else { - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, + result = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position); if (!result) return REG_ESPACE; } ctx->position++; - ctx->re++; + ctx->re += clen; break; } break; case PARSE_MARK_FOR_SUBMATCH: { - int submatch_id = (int)tre_stack_pop(stack); + int submatch_id = tre_stack_pop_int(stack); if (result->submatch_id >= 0) { @@ -1401,7 +1515,11 @@ tre_parse(tre_parse_ctx_t *ctx) } case PARSE_RESTORE_CFLAGS: - ctx->cflags = (int)tre_stack_pop(stack); + ctx->cflags = tre_stack_pop_int(stack); + break; + + default: + assert(0); break; } } @@ -1417,10 +1535,18 @@ tre_parse(tre_parse_ctx_t *ctx) } + /*********************************************************************** from tre-compile.c ***********************************************************************/ + +/* + TODO: + - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive + function calls. +*/ + /* Algorithms to setup tags so that submatch addressing can be done. */ @@ -1429,42 +1555,60 @@ tre_parse(tre_parse_ctx_t *ctx) /* Inserts a catenation node to the root of the tree given in `node'. As the left child a new tag with number `tag_id' to `node' is added, and the right child is the old root. */ -/* OR */ +static reg_errcode_t +tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) +{ + tre_catenation_t *c; + + c = tre_mem_alloc(mem, sizeof(*c)); + if (c == NULL) + return REG_ESPACE; + c->left = tre_ast_new_literal(mem, TAG, tag_id, -1); + if (c->left == NULL) + return REG_ESPACE; + c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + if (c->right == NULL) + return REG_ESPACE; + + c->right->obj = node->obj; + c->right->type = node->type; + c->right->nullable = -1; + c->right->submatch_id = -1; + c->right->firstpos = NULL; + c->right->lastpos = NULL; + c->right->num_tags = 0; + node->obj = c; + node->type = CATENATION; + return REG_OK; +} + /* Inserts a catenation node to the root of the tree given in `node'. As the right child a new tag with number `tag_id' to `node' is added, and the left child is the old root. */ static reg_errcode_t -tre_add_tag(tre_mem_t mem, tre_ast_node_t *node, int tag_id, int right) +tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) { tre_catenation_t *c; - tre_ast_node_t *child_tag, *child_old; - - DPRINT(("add_tag_%s: tag %d\n", right ? "right" : "left", tag_id)); c = tre_mem_alloc(mem, sizeof(*c)); if (c == NULL) return REG_ESPACE; - child_tag = tre_ast_new_literal(mem, TAG, tag_id, -1); - if (child_tag == NULL) + c->right = tre_ast_new_literal(mem, TAG, tag_id, -1); + if (c->right == NULL) return REG_ESPACE; - child_old = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); - if (child_old == NULL) + c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + if (c->left == NULL) return REG_ESPACE; - child_old->obj = node->obj; - child_old->type = node->type; - child_old->nullable = -1; - child_old->submatch_id = -1; - child_old->firstpos = NULL; - child_old->lastpos = NULL; - child_old->num_tags = 0; + c->left->obj = node->obj; + c->left->type = node->type; + c->left->nullable = -1; + c->left->submatch_id = -1; + c->left->firstpos = NULL; + c->left->lastpos = NULL; + c->left->num_tags = 0; node->obj = c; node->type = CATENATION; - - c->right = c->left = child_old; - if (right) c->right = child_tag; - else c->left = child_tag; - return REG_OK; } @@ -1484,6 +1628,27 @@ typedef struct { int next_tag; } tre_tag_states_t; + +/* Go through `regset' and set submatch data for submatches that are + using this tag. */ +static void +tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag) +{ + int i; + + for (i = 0; regset[i] >= 0; i++) + { + int id = regset[i] / 2; + int start = !(regset[i] % 2); + if (start) + tnfa->submatch_data[id].so_tag = tag; + else + tnfa->submatch_data[id].eo_tag = tag; + } + regset[0] = -1; +} + + /* Adds tags to appropriate locations in the parse tree in `tree', so that subexpressions marked for submatch addressing can be traced. */ static reg_errcode_t @@ -1498,15 +1663,20 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, int first_pass = (mem == NULL || tnfa == NULL); int *regset, *orig_regset; int num_tags = 0; /* Total number of tags. */ + int num_minimals = 0; /* Number of special minimal tags. */ int tag = 0; /* The tag that is to be added next. */ int next_tag = 1; /* Next tag to use after this one. */ int *parents; /* Stack of submatches the current submatch is contained in. */ + int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */ tre_tag_states_t *saved_states; tre_tag_direction_t direction = TRE_TAG_MINIMIZE; if (!first_pass) + { tnfa->end_tag = 0; + tnfa->minimal_tags[0] = -1; + } regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2)); if (regset == NULL) @@ -1536,21 +1706,21 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, saved_states[i].tag = -1; } - STACK_PUSH(stack, node); - STACK_PUSH(stack, ADDTAGS_RECURSE); + STACK_PUSH(stack, voidptr, node); + STACK_PUSH(stack, int, ADDTAGS_RECURSE); while (tre_stack_num_objects(stack) > bottom) { if (status != REG_OK) break; - symbol = (tre_addtags_symbol_t)tre_stack_pop(stack); + symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); switch (symbol) { case ADDTAGS_SET_SUBMATCH_END: { - int id = (int)tre_stack_pop(stack); + int id = tre_stack_pop_int(stack); int i; /* Add end of this submatch to regset. */ @@ -1565,7 +1735,7 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, } case ADDTAGS_RECURSE: - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); if (node->submatch_id >= 0) { @@ -1600,8 +1770,8 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, /* Add end of this submatch to regset after processing this node. */ - STACK_PUSHX(stack, node->submatch_id); - STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END); + STACK_PUSHX(stack, int, node->submatch_id); + STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END); } switch (node->type) @@ -1613,38 +1783,30 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) { int i; - DPRINT(("Literal %d-%d\n", - (int)lit->code_min, (int)lit->code_max)); if (regset[0] >= 0) { /* Regset is not empty, so add a tag before the literal or backref. */ if (!first_pass) { - status = tre_add_tag(mem, node, tag, 0 /*left*/); + status = tre_add_tag_left(mem, node, tag); tnfa->tag_directions[tag] = direction; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) + if (minimal_tag >= 0) { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", tag, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = tag; - else - tnfa->submatch_data[id].eo_tag = tag; + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } + tre_purge_regset(regset, tnfa, tag); } else { - DPRINT((" num_tags = 1\n")); node->num_tags = 1; } - DPRINT((" num_tags++\n")); regset[0] = -1; tag = next_tag; num_tags++; @@ -1663,82 +1825,75 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, tre_ast_node_t *left = cat->left; tre_ast_node_t *right = cat->right; int reserved_tag = -1; - DPRINT(("Catenation, next_tag = %d\n", next_tag)); /* After processing right child. */ - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT); /* Process right child. */ - STACK_PUSHX(stack, right); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* After processing left child. */ - STACK_PUSHX(stack, next_tag + left->num_tags); - DPRINT((" Pushing %d for after left\n", - next_tag + left->num_tags)); + STACK_PUSHX(stack, int, next_tag + left->num_tags); if (left->num_tags > 0 && right->num_tags > 0) { /* Reserve the next tag to the right child. */ - DPRINT((" Reserving next_tag %d to right child\n", - next_tag)); reserved_tag = next_tag; next_tag++; } - STACK_PUSHX(stack, reserved_tag); - STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT); + STACK_PUSHX(stack, int, reserved_tag); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT); /* Process left child. */ - STACK_PUSHX(stack, left); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); } break; case ITERATION: { tre_iteration_t *iter = node->obj; - DPRINT(("Iteration\n")); if (first_pass) { - STACK_PUSHX(stack, regset[0] >= 0); + STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal); } else { - STACK_PUSHX(stack, tag); + STACK_PUSHX(stack, int, tag); + STACK_PUSHX(stack, int, iter->minimal); } - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION); - STACK_PUSHX(stack, iter->arg); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* Regset is not empty, so add a tag here. */ - if (regset[0] >= 0) + if (regset[0] >= 0 || iter->minimal) { if (!first_pass) { int i; - status = tre_add_tag(mem, node, tag, 0 /*left*/); - tnfa->tag_directions[tag] = direction; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) + status = tre_add_tag_left(mem, node, tag); + if (iter->minimal) + tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; + else + tnfa->tag_directions[tag] = direction; + if (minimal_tag >= 0) { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", tag, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = tag; - else - tnfa->submatch_data[id].eo_tag = tag; + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } + tre_purge_regset(regset, tnfa, tag); } - DPRINT((" num_tags++\n")); regset[0] = -1; tag = next_tag; num_tags++; @@ -1766,28 +1921,26 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, right_tag = next_tag; } - DPRINT(("Union\n")); - /* After processing right child. */ - STACK_PUSHX(stack, right_tag); - STACK_PUSHX(stack, left_tag); - STACK_PUSHX(stack, regset); - STACK_PUSHX(stack, regset[0] >= 0); - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, right); - STACK_PUSHX(stack, left); - STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT); + STACK_PUSHX(stack, int, right_tag); + STACK_PUSHX(stack, int, left_tag); + STACK_PUSHX(stack, voidptr, regset); + STACK_PUSHX(stack, int, regset[0] >= 0); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT); /* Process right child. */ - STACK_PUSHX(stack, right); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* After processing left child. */ - STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT); /* Process left child. */ - STACK_PUSHX(stack, left); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* Regset is not empty, so add a tag here. */ if (regset[0] >= 0) @@ -1795,25 +1948,20 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, if (!first_pass) { int i; - status = tre_add_tag(mem, node, tag, 0 /*left*/); + status = tre_add_tag_left(mem, node, tag); tnfa->tag_directions[tag] = direction; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) + if (minimal_tag >= 0) { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", tag, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = tag; - else - tnfa->submatch_data[id].eo_tag = tag; + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } + tre_purge_regset(regset, tnfa, tag); } - DPRINT((" num_tags++\n")); regset[0] = -1; tag = next_tag; num_tags++; @@ -1845,43 +1993,52 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, case ADDTAGS_AFTER_ITERATION: { + int minimal = 0; int enter_tag; - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); if (first_pass) + { node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags - + (int)tre_stack_pop(stack); + + tre_stack_pop_int(stack); + minimal_tag = -1; + } else - enter_tag = (int)tre_stack_pop(stack); + { + minimal = tre_stack_pop_int(stack); + enter_tag = tre_stack_pop_int(stack); + if (minimal) + minimal_tag = enter_tag; + } - DPRINT(("After iteration\n")); - direction = TRE_TAG_MAXIMIZE; + if (!first_pass) + { + if (minimal) + direction = TRE_TAG_MINIMIZE; + else + direction = TRE_TAG_MAXIMIZE; + } break; } case ADDTAGS_AFTER_CAT_LEFT: { - int new_tag = (int)tre_stack_pop(stack); - next_tag = (int)tre_stack_pop(stack); - DPRINT(("After cat left, tag = %d, next_tag = %d\n", - tag, next_tag)); + int new_tag = tre_stack_pop_int(stack); + next_tag = tre_stack_pop_int(stack); if (new_tag >= 0) { - DPRINT((" Setting tag to %d\n", new_tag)); tag = new_tag; } break; } case ADDTAGS_AFTER_CAT_RIGHT: - DPRINT(("After cat right\n")); - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); if (first_pass) node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags + ((tre_catenation_t *)node->obj)->right->num_tags; break; case ADDTAGS_AFTER_UNION_LEFT: - DPRINT(("After union left\n")); /* Lift the bottom of the `regset' array so that when processing the right operand the items currently in the array are invisible. The original bottom was saved at ADDTAGS_UNION and @@ -1893,20 +2050,19 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, case ADDTAGS_AFTER_UNION_RIGHT: { int added_tags, tag_left, tag_right; - tre_ast_node_t *left = tre_stack_pop(stack); - tre_ast_node_t *right = tre_stack_pop(stack); - DPRINT(("After union right\n")); - node = tre_stack_pop(stack); - added_tags = (int)tre_stack_pop(stack); + tre_ast_node_t *left = tre_stack_pop_voidptr(stack); + tre_ast_node_t *right = tre_stack_pop_voidptr(stack); + node = tre_stack_pop_voidptr(stack); + added_tags = tre_stack_pop_int(stack); if (first_pass) { node->num_tags = ((tre_union_t *)node->obj)->left->num_tags + ((tre_union_t *)node->obj)->right->num_tags + added_tags + ((node->num_submatches > 0) ? 2 : 0); } - regset = tre_stack_pop(stack); - tag_left = (int)tre_stack_pop(stack); - tag_right = (int)tre_stack_pop(stack); + regset = tre_stack_pop_voidptr(stack); + tag_left = tre_stack_pop_int(stack); + tag_right = tre_stack_pop_int(stack); /* Add tags after both children, the left child gets a smaller tag than the right child. This guarantees that we prefer @@ -1921,12 +2077,11 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, { if (!first_pass) { - status = tre_add_tag(mem, left, tag_left, 1 /*right*/); - tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; - status = tre_add_tag(mem, right, tag_right, 1 /*right*/); - tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; + status = tre_add_tag_right(mem, left, tag_left); + tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; + status = tre_add_tag_right(mem, right, tag_right); + tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE; } - DPRINT((" num_tags += 2\n")); num_tags += 2; } direction = TRE_TAG_MAXIMIZE; @@ -1941,30 +2096,23 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, } /* end while(tre_stack_num_objects(stack) > bottom) */ if (!first_pass) + tre_purge_regset(regset, tnfa, tag); + + if (!first_pass && minimal_tag >= 0) { int i; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) - { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", num_tags, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = num_tags; - else - tnfa->submatch_data[id].eo_tag = num_tags; - } + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } - DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n", - first_pass? "First pass" : "Second pass", num_tags)); - assert(tree->num_tags == num_tags); tnfa->end_tag = num_tags; tnfa->num_tags = num_tags; + tnfa->num_minimals = num_minimals; xfree(orig_regset); xfree(parents); xfree(saved_states); @@ -1998,8 +2146,8 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, tre_ast_node_t **result = copy; tre_copyast_symbol_t symbol; - STACK_PUSH(stack, ast); - STACK_PUSH(stack, COPY_RECURSE); + STACK_PUSH(stack, voidptr, ast); + STACK_PUSH(stack, int, COPY_RECURSE); while (status == REG_OK && tre_stack_num_objects(stack) > bottom) { @@ -2007,14 +2155,14 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (status != REG_OK) break; - symbol = (tre_copyast_symbol_t)tre_stack_pop(stack); + symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack); switch (symbol) { case COPY_SET_RESULT_PTR: - result = tre_stack_pop(stack); + result = tre_stack_pop_voidptr(stack); break; case COPY_RECURSE: - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); switch (node->type) { case LITERAL: @@ -2055,52 +2203,53 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, case UNION: { tre_union_t *uni = node->obj; - tre_union_t *copy; + tre_union_t *tmp; *result = tre_ast_new_union(mem, uni->left, uni->right); if (*result == NULL) { status = REG_ESPACE; break; } - copy = (*result)->obj; - result = ©->left; - STACK_PUSHX(stack, uni->right); - STACK_PUSHX(stack, COPY_RECURSE); - STACK_PUSHX(stack, ©->right); - STACK_PUSHX(stack, COPY_SET_RESULT_PTR); - STACK_PUSHX(stack, uni->left); - STACK_PUSHX(stack, COPY_RECURSE); + tmp = (*result)->obj; + result = &tmp->left; + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, int, COPY_RECURSE); + STACK_PUSHX(stack, voidptr, &tmp->right); + STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); + STACK_PUSHX(stack, voidptr, uni->left); + STACK_PUSHX(stack, int, COPY_RECURSE); break; } case CATENATION: { tre_catenation_t *cat = node->obj; - tre_catenation_t *copy; + tre_catenation_t *tmp; *result = tre_ast_new_catenation(mem, cat->left, cat->right); if (*result == NULL) { status = REG_ESPACE; break; } - copy = (*result)->obj; - copy->left = NULL; - copy->right = NULL; - result = ©->left; - - STACK_PUSHX(stack, cat->right); - STACK_PUSHX(stack, COPY_RECURSE); - STACK_PUSHX(stack, ©->right); - STACK_PUSHX(stack, COPY_SET_RESULT_PTR); - STACK_PUSHX(stack, cat->left); - STACK_PUSHX(stack, COPY_RECURSE); + tmp = (*result)->obj; + tmp->left = NULL; + tmp->right = NULL; + result = &tmp->left; + + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, int, COPY_RECURSE); + STACK_PUSHX(stack, voidptr, &tmp->right); + STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, int, COPY_RECURSE); break; } case ITERATION: { tre_iteration_t *iter = node->obj; - STACK_PUSHX(stack, iter->arg); - STACK_PUSHX(stack, COPY_RECURSE); - *result = tre_ast_new_iter(mem, iter->arg, iter->min, iter->max); + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, COPY_RECURSE); + *result = tre_ast_new_iter(mem, iter->arg, iter->min, + iter->max, iter->minimal); if (*result == NULL) { status = REG_ESPACE; @@ -2138,11 +2287,10 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, int pos_add = 0; int pos_add_total = 0; int max_pos = 0; - /* Approximate parameter nesting level. */ int iter_depth = 0; - STACK_PUSHR(stack, ast); - STACK_PUSHR(stack, EXPAND_RECURSE); + STACK_PUSHR(stack, voidptr, ast); + STACK_PUSHR(stack, int, EXPAND_RECURSE); while (status == REG_OK && tre_stack_num_objects(stack) > bottom) { tre_ast_node_t *node; @@ -2151,10 +2299,8 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (status != REG_OK) break; - DPRINT(("pos_add %d\n", pos_add)); - - symbol = (tre_expand_ast_symbol_t)tre_stack_pop(stack); - node = tre_stack_pop(stack); + symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); switch (symbol) { case EXPAND_RECURSE: @@ -2174,36 +2320,35 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, case UNION: { tre_union_t *uni = node->obj; - STACK_PUSHX(stack, uni->right); - STACK_PUSHX(stack, EXPAND_RECURSE); - STACK_PUSHX(stack, uni->left); - STACK_PUSHX(stack, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, uni->left); + STACK_PUSHX(stack, int, EXPAND_RECURSE); break; } case CATENATION: { tre_catenation_t *cat = node->obj; - STACK_PUSHX(stack, cat->right); - STACK_PUSHX(stack, EXPAND_RECURSE); - STACK_PUSHX(stack, cat->left); - STACK_PUSHX(stack, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, int, EXPAND_RECURSE); break; } case ITERATION: { tre_iteration_t *iter = node->obj; - STACK_PUSHX(stack, pos_add); - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, EXPAND_AFTER_ITER); - STACK_PUSHX(stack, iter->arg); - STACK_PUSHX(stack, EXPAND_RECURSE); + STACK_PUSHX(stack, int, pos_add); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, EXPAND_AFTER_ITER); + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, EXPAND_RECURSE); /* If we are going to expand this node at EXPAND_AFTER_ITER then don't increase the `pos' fields of the nodes now, it will get done when expanding. */ if (iter->min > 1 || iter->max > 1) pos_add = 0; iter_depth++; - DPRINT(("iter\n")); break; } default: @@ -2215,23 +2360,22 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, { tre_iteration_t *iter = node->obj; int pos_add_last; - pos_add = (int)tre_stack_pop(stack); + pos_add = tre_stack_pop_int(stack); pos_add_last = pos_add; if (iter->min > 1 || iter->max > 1) { tre_ast_node_t *seq1 = NULL, *seq2 = NULL; - int i; + int j; int pos_add_save = pos_add; /* Create a catenated sequence of copies of the node. */ - for (i = 0; i < iter->min; i++) + for (j = 0; j < iter->min; j++) { tre_ast_node_t *copy; /* Remove tags from all but the last copy. */ - int flags = ((i + 1 < iter->min) + int flags = ((j + 1 < iter->min) ? COPY_REMOVE_TAGS : COPY_MAXIMIZE_FIRST_TAG); - DPRINT((" pos_add %d\n", pos_add)); pos_add_save = pos_add; status = tre_copy_ast(mem, stack, iter->arg, flags, &pos_add, tag_directions, ©, @@ -2254,13 +2398,13 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, &pos_add, NULL, &seq2, &max_pos); if (status != REG_OK) return status; - seq2 = tre_ast_new_iter(mem, seq2, 0, -1); + seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0); if (seq2 == NULL) return REG_ESPACE; } else { - for (i = iter->min; i < iter->max; i++) + for (j = iter->min; j < iter->max; j++) { tre_ast_node_t *tmp, *copy; pos_add_save = pos_add; @@ -2316,12 +2460,6 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (max_pos > *position) *position = max_pos; -#ifdef TRE_DEBUG - DPRINT(("Expanded AST:\n")); - tre_ast_print(ast); - DPRINT(("*position %d, max_pos %d\n", *position, max_pos)); -#endif - return status; } @@ -2441,7 +2579,8 @@ tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2, set to the number of tags seen on the path. */ static reg_errcode_t tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, - int *assertions, int *num_tags_seen) + int *assertions, int *params, int *num_tags_seen, + int *params_seen) { tre_literal_t *lit; tre_union_t *uni; @@ -2453,12 +2592,12 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, if (num_tags_seen) *num_tags_seen = 0; - status = tre_stack_push(stack, node); + status = tre_stack_push_voidptr(stack, node); /* Walk through the tree recursively. */ while (status == REG_OK && tre_stack_num_objects(stack) > bottom) { - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); switch (node->type) { @@ -2505,9 +2644,9 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, right subexpression. */ uni = (tre_union_t *)node->obj; if (uni->left->nullable) - STACK_PUSHX(stack, uni->left) + STACK_PUSHX(stack, voidptr, uni->left) else if (uni->right->nullable) - STACK_PUSHX(stack, uni->right) + STACK_PUSHX(stack, voidptr, uni->right) else assert(0); break; @@ -2517,8 +2656,8 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, cat = (tre_catenation_t *)node->obj; assert(cat->left->nullable); assert(cat->right->nullable); - STACK_PUSHX(stack, cat->left); - STACK_PUSHX(stack, cat->right); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, voidptr, cat->right); break; case ITERATION: @@ -2526,7 +2665,7 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, all, so we go through the argument if possible. */ iter = (tre_iteration_t *)node->obj; if (iter->arg->nullable) - STACK_PUSHX(stack, iter->arg); + STACK_PUSHX(stack, voidptr, iter->arg); break; default: @@ -2554,16 +2693,16 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) { int bottom = tre_stack_num_objects(stack); - STACK_PUSHR(stack, tree); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, tree); + STACK_PUSHR(stack, int, NFL_RECURSE); while (tre_stack_num_objects(stack) > bottom) { tre_nfl_stack_symbol_t symbol; tre_ast_node_t *node; - symbol = (tre_nfl_stack_symbol_t) tre_stack_pop(stack); - node = tre_stack_pop(stack); + symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); switch (symbol) { case NFL_RECURSE: @@ -2583,13 +2722,13 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) return REG_ESPACE; node->lastpos = tre_set_one(mem, lit->position, 0, TRE_CHAR_MAX, 0, NULL, - lit->code_max); + (int)lit->code_max); if (!node->lastpos) return REG_ESPACE; } else if (lit->code_min < 0) { - /* Tags, empty strings and zero width assertions: + /* Tags, empty strings, params, and zero width assertions: nullable = true, firstpos = {}, and lastpos = {}. */ node->nullable = 1; node->firstpos = tre_set_empty(mem); @@ -2605,13 +2744,14 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) lastpos = {i}. */ node->nullable = 0; node->firstpos = - tre_set_one(mem, lit->position, lit->code_min, - lit->code_max, 0, NULL, -1); + tre_set_one(mem, lit->position, (int)lit->code_min, + (int)lit->code_max, 0, NULL, -1); if (!node->firstpos) return REG_ESPACE; node->lastpos = tre_set_one(mem, lit->position, - lit->code_min, lit->code_max, - lit->class, lit->neg_classes, + (int)lit->code_min, + (int)lit->code_max, + lit->u.class, lit->neg_classes, -1); if (!node->lastpos) return REG_ESPACE; @@ -2622,32 +2762,32 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) case UNION: /* Compute the attributes for the two subtrees, and after that for this node. */ - STACK_PUSHR(stack, node); - STACK_PUSHR(stack, NFL_POST_UNION); - STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right); - STACK_PUSHR(stack, NFL_RECURSE); - STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_UNION); + STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right); + STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left); + STACK_PUSHR(stack, int, NFL_RECURSE); break; case CATENATION: /* Compute the attributes for the two subtrees, and after that for this node. */ - STACK_PUSHR(stack, node); - STACK_PUSHR(stack, NFL_POST_CATENATION); - STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right); - STACK_PUSHR(stack, NFL_RECURSE); - STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_CATENATION); + STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right); + STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left); + STACK_PUSHR(stack, int, NFL_RECURSE); break; case ITERATION: /* Compute the attributes for the subtree, and after that for this node. */ - STACK_PUSHR(stack, node); - STACK_PUSHR(stack, NFL_POST_ITERATION); - STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_ITERATION); + STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg); + STACK_PUSHR(stack, int, NFL_RECURSE); break; } break; /* end case: NFL_RECURSE */ @@ -2682,7 +2822,8 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) case NFL_POST_CATENATION: { - int num_tags, *tags, assertions; + int num_tags, *tags, assertions, params_seen; + int *params; reg_errcode_t status; tre_catenation_t *cat = node->obj; node->nullable = cat->left->nullable && cat->right->nullable; @@ -2691,9 +2832,11 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) if (cat->left->nullable) { /* The left side matches the empty string. Make a first pass - with tre_match_empty() to get the number of tags. */ + with tre_match_empty() to get the number of tags and + parameters. */ status = tre_match_empty(stack, cat->left, - NULL, NULL, &num_tags); + NULL, NULL, NULL, &num_tags, + ¶ms_seen); if (status != REG_OK) return status; /* Allocate arrays for the tags and parameters. */ @@ -2703,9 +2846,9 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) tags[0] = -1; assertions = 0; /* Second pass with tre_mach_empty() to get the list of - tags. */ + tags and parameters. */ status = tre_match_empty(stack, cat->left, tags, - &assertions, NULL); + &assertions, params, NULL, NULL); if (status != REG_OK) { xfree(tags); @@ -2727,9 +2870,11 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) if (cat->right->nullable) { /* The right side matches the empty string. Make a first pass - with tre_match_empty() to get the number of tags. */ + with tre_match_empty() to get the number of tags and + parameters. */ status = tre_match_empty(stack, cat->right, - NULL, NULL, &num_tags); + NULL, NULL, NULL, &num_tags, + ¶ms_seen); if (status != REG_OK) return status; /* Allocate arrays for the tags and parameters. */ @@ -2739,9 +2884,9 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) tags[0] = -1; assertions = 0; /* Second pass with tre_mach_empty() to get the list of - tags. */ + tags and parameters. */ status = tre_match_empty(stack, cat->right, tags, - &assertions, NULL); + &assertions, params, NULL, NULL); if (status != REG_OK) { xfree(tags); @@ -2814,7 +2959,6 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, detect it here. */ if (trans->state_id == p2->position) { - DPRINT(("*")); break; } #endif @@ -2903,39 +3047,6 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, trans->tags[l] = -1; } - -#ifdef TRE_DEBUG - { - int *tags; - - DPRINT((" %2d -> %2d on %3d", p1->position, p2->position, - p1->code_min)); - if (p1->code_max != p1->code_min) - DPRINT(("-%3d", p1->code_max)); - tags = trans->tags; - if (tags) - { - DPRINT((", tags [")); - while (*tags >= 0) - { - DPRINT(("%d", *tags)); - tags++; - if (*tags >= 0) - DPRINT((",")); - } - DPRINT(("]")); - } - if (trans->assertions) - DPRINT((", assert %d", trans->assertions)); - if (trans->assertions & ASSERT_BACKREF) - DPRINT((", backref %d", trans->u.backref)); - else if (trans->class) - DPRINT((", class %ld", (long)trans->class)); - if (trans->neg_classes) - DPRINT((", neg_classes %p", trans->neg_classes)); - DPRINT(("\n")); - } -#endif /* TRE_DEBUG */ p2++; } p1++; @@ -3017,63 +3128,18 @@ tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, } -static void -tre_free(regex_t *preg) -{ - tre_tnfa_t *tnfa; - unsigned int i; - tre_tnfa_transition_t *trans; - - tnfa = (void *)preg->TRE_REGEX_T_FIELD; - if (!tnfa) - return; - - for (i = 0; i < tnfa->num_transitions; i++) - if (tnfa->transitions[i].state) - { - if (tnfa->transitions[i].tags) - xfree(tnfa->transitions[i].tags); - if (tnfa->transitions[i].neg_classes) - xfree(tnfa->transitions[i].neg_classes); - } - if (tnfa->transitions) - xfree(tnfa->transitions); - - if (tnfa->initial) - { - for (trans = tnfa->initial; trans->state; trans++) - { - if (trans->tags) - xfree(trans->tags); - } - xfree(tnfa->initial); - } - - if (tnfa->submatch_data) - { - for (i = 0; i < tnfa->num_submatches; i++) - if (tnfa->submatch_data[i].parents) - xfree(tnfa->submatch_data[i].parents); - xfree(tnfa->submatch_data); - } - - if (tnfa->tag_directions) - xfree(tnfa->tag_directions); - xfree(tnfa); -} - - #define ERROR_EXIT(err) \ do \ { \ errcode = err; \ - if (1) goto error_exit; \ + if (/*CONSTCOND*/1) \ + goto error_exit; \ } \ - while (0) + while (/*CONSTCOND*/0) -static int -tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) +int +regcomp(regex_t *preg, const char *regex, int cflags) { tre_stack_t *stack; tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; @@ -3108,16 +3174,19 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) parse_ctx.mem = mem; parse_ctx.stack = stack; parse_ctx.re = regex; - parse_ctx.len = n; parse_ctx.cflags = cflags; parse_ctx.max_backref = -1; - DPRINT(("tre_compile: parsing '%.*" STRF "'\n", n, regex)); errcode = tre_parse(&parse_ctx); if (errcode != REG_OK) ERROR_EXIT(errcode); - preg->re_nsub = preg->__nsub2 = parse_ctx.submatch_id - 1; + preg->re_nsub = parse_ctx.submatch_id - 1; tree = parse_ctx.result; + /* Back references and approximate matching cannot currently be used + in the same regexp. */ + if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx) + ERROR_EXIT(REG_BADPAT); + #ifdef TRE_DEBUG tre_ast_print(tree); #endif /* TRE_DEBUG */ @@ -3131,21 +3200,18 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (tnfa == NULL) ERROR_EXIT(REG_ESPACE); tnfa->have_backrefs = parse_ctx.max_backref >= 0; + tnfa->have_approx = parse_ctx.have_approx; tnfa->num_submatches = parse_ctx.submatch_id; /* Set up tags for submatch addressing. If REG_NOSUB is set and the regexp does not have back references, this can be skipped. */ if (tnfa->have_backrefs || !(cflags & REG_NOSUB)) { - DPRINT(("tre_compile: setting up tags\n")); /* Figure out how many tags we will need. */ errcode = tre_add_tags(NULL, stack, tree, tnfa); if (errcode != REG_OK) ERROR_EXIT(errcode); -#ifdef TRE_DEBUG - tre_ast_print(tree); -#endif /* TRE_DEBUG */ if (tnfa->num_tags > 0) { @@ -3157,8 +3223,13 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) memset(tag_directions, -1, sizeof(*tag_directions) * (tnfa->num_tags + 1)); } + tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1, + sizeof(tnfa->minimal_tags)); + if (tnfa->minimal_tags == NULL) + ERROR_EXIT(REG_ESPACE); - submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data)); + submatch_data = xcalloc((unsigned)parse_ctx.submatch_id, + sizeof(*submatch_data)); if (submatch_data == NULL) ERROR_EXIT(REG_ESPACE); tnfa->submatch_data = submatch_data; @@ -3167,20 +3238,11 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (errcode != REG_OK) ERROR_EXIT(errcode); -#ifdef TRE_DEBUG - for (i = 0; i < parse_ctx.submatch_id; i++) - DPRINT(("pmatch[%d] = {t%d, t%d}\n", - i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); - for (i = 0; i < tnfa->num_tags; i++) - DPRINT(("t%d is %s\n", i, - tag_directions[i] == TRE_TAG_MINIMIZE ? - "minimized" : "maximized")); -#endif /* TRE_DEBUG */ } /* Expand iteration nodes. */ errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, - tag_directions, NULL); + tag_directions, &tnfa->params_depth); if (errcode != REG_OK) ERROR_EXIT(errcode); @@ -3197,11 +3259,6 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (tree == NULL) ERROR_EXIT(REG_ESPACE); -#ifdef TRE_DEBUG - tre_ast_print(tree); - DPRINT(("Number of states: %d\n", parse_ctx.position)); -#endif /* TRE_DEBUG */ - errcode = tre_compute_nfl(mem, stack, tree); if (errcode != REG_OK) ERROR_EXIT(errcode); @@ -3225,49 +3282,27 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) add += counts[i] + 1; counts[i] = 0; } - transitions = xcalloc(add + 1, sizeof(*transitions)); + transitions = xcalloc((unsigned)add + 1, sizeof(*transitions)); if (transitions == NULL) ERROR_EXIT(REG_ESPACE); tnfa->transitions = transitions; tnfa->num_transitions = add; - DPRINT(("Converting to TNFA:\n")); errcode = tre_ast_to_tnfa(tree, transitions, counts, offs); if (errcode != REG_OK) ERROR_EXIT(errcode); + tnfa->firstpos_chars = NULL; + p = tree->firstpos; i = 0; while (p->position >= 0) { i++; - -#ifdef TRE_DEBUG - { - int *tags; - DPRINT(("initial: %d", p->position)); - tags = p->tags; - if (tags != NULL) - { - if (*tags >= 0) - DPRINT(("/")); - while (*tags >= 0) - { - DPRINT(("%d", *tags)); - tags++; - if (*tags >= 0) - DPRINT((",")); - } - } - DPRINT((", assert %d", p->assertions)); - DPRINT(("\n")); - } -#endif /* TRE_DEBUG */ - p++; } - initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t)); + initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t)); if (initial == NULL) ERROR_EXIT(REG_ESPACE); tnfa->initial = initial; @@ -3278,7 +3313,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) initial[i].state = transitions + offs[p->position]; initial[i].state_id = p->position; initial[i].tags = NULL; - /* Copy the arrays p->tags, they are allocated + /* Copy the arrays p->tags, and p->params, they are allocated from a tre_mem object. */ if (p->tags) { @@ -3299,8 +3334,6 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) tnfa->num_states = parse_ctx.position; tnfa->cflags = cflags; - DPRINT(("final state %p\n", (void *)tnfa->final)); - tre_mem_destroy(mem); tre_stack_destroy(stack); xfree(counts); @@ -3319,44 +3352,58 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (offs != NULL) xfree(offs); preg->TRE_REGEX_T_FIELD = (void *)tnfa; - tre_free(preg); + regfree(preg); return errcode; } -/*********************************************************************** - from regcomp.c -***********************************************************************/ -int -regcomp(regex_t *preg, const char *regex, int cflags) + +void +regfree(regex_t *preg) { - int ret; - tre_char_t *wregex; - size_t n = strlen(regex); + tre_tnfa_t *tnfa; + unsigned int i; + tre_tnfa_transition_t *trans; - if (n+1 > SIZE_MAX/sizeof(tre_char_t)) - return REG_ESPACE; - wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); - if (wregex == NULL) - return REG_ESPACE; + tnfa = (void *)preg->TRE_REGEX_T_FIELD; + if (!tnfa) + return; - n = mbstowcs(wregex, regex, n+1); - if (n == (size_t)-1) { - xfree(wregex); - return REG_BADPAT; - } + for (i = 0; i < tnfa->num_transitions; i++) + if (tnfa->transitions[i].state) + { + if (tnfa->transitions[i].tags) + xfree(tnfa->transitions[i].tags); + if (tnfa->transitions[i].neg_classes) + xfree(tnfa->transitions[i].neg_classes); + } + if (tnfa->transitions) + xfree(tnfa->transitions); - ret = tre_compile(preg, wregex, n, cflags); - xfree(wregex); + if (tnfa->initial) + { + for (trans = tnfa->initial; trans->state; trans++) + { + if (trans->tags) + xfree(trans->tags); + } + xfree(tnfa->initial); + } - return ret; -} + if (tnfa->submatch_data) + { + for (i = 0; i < tnfa->num_submatches; i++) + if (tnfa->submatch_data[i].parents) + xfree(tnfa->submatch_data[i].parents); + xfree(tnfa->submatch_data); + } -void -regfree(regex_t *preg) -{ - tre_free(preg); + if (tnfa->tag_directions) + xfree(tnfa->tag_directions); + if (tnfa->firstpos_chars) + xfree(tnfa->firstpos_chars); + if (tnfa->minimal_tags) + xfree(tnfa->minimal_tags); + xfree(tnfa); } - -/* EOF */ diff --git a/src/regex/regerror.c b/src/regex/regerror.c index 39d70b2a..17119fff 100644 --- a/src/regex/regerror.c +++ b/src/regex/regerror.c @@ -1,26 +1,6 @@ -/* - regerror.c - POSIX regerror() implementation for TRE. - - Copyright (c) 2001-2006 Ville Laurikari . - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*/ - #include #include +#include /* Error message strings for error codes listed in `regex.h'. This list needs to be in sync with the codes listed there, naturally. */ @@ -28,7 +8,7 @@ /* Converted to single string by Rich Felker to remove the need for * data relocations at runtime, 27 Feb 2006. */ -static const char tre_error_messages[] = { +static const char messages[] = { "No error\0" "No match\0" "Invalid regexp\0" @@ -43,33 +23,13 @@ static const char tre_error_messages[] = { "Invalid character range\0" "Out of memory\0" "XXX\0" + "\0Unknown error" }; -size_t -regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) +size_t regerror(int e, const regex_t *preg, char *buf, size_t size) { - const char *err; - size_t err_len; - - if (errcode >= 0 && errcode <= REG_BADRPT) - for (err=tre_error_messages; errcode; errcode--, err+=strlen(err)+1); - else - err = "Unknown error"; - - err_len = strlen(err) + 1; - if (errbuf_size > 0 && errbuf != NULL) - { - if (err_len > errbuf_size) - { - memcpy(errbuf, err, errbuf_size - 1); - errbuf[errbuf_size - 1] = '\0'; - } - else - { - strcpy(errbuf, err); - } - } - return err_len; + const char *s; + for (s=messages; e && *s; e--, e+=strlen(s)+1); + if (!*s) s++; + return 1+snprintf(buf, size, "%s", s); } - -/* EOF */ diff --git a/src/regex/regexec.c b/src/regex/regexec.c index f7aef506..79874ca5 100644 --- a/src/regex/regexec.c +++ b/src/regex/regexec.c @@ -1,21 +1,31 @@ /* regexec.c - TRE POSIX compatible matching functions (and more). - Copyright (c) 2001-2006 Ville Laurikari . - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (c) 2001-2009 Ville Laurikari + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -48,13 +58,40 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, str_byte += pos_add_next; \ } while (0) +#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c)) + #define CHECK_ASSERTIONS(assertions) \ (((assertions & ASSERT_AT_BOL) \ && (pos > 0 || reg_notbol) \ && (prev_c != L'\n' || !reg_newline)) \ || ((assertions & ASSERT_AT_EOL) \ && (next_c != L'\0' || reg_noteol) \ - && (next_c != L'\n' || !reg_newline))) + && (next_c != L'\n' || !reg_newline)) \ + || ((assertions & ASSERT_AT_BOW) \ + && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_EOW) \ + && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_WB) \ + && (pos != 0 && next_c != L'\0' \ + && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_WB_NEG) \ + && (pos == 0 || next_c == L'\0' \ + || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) + +#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ + (((trans_i->assertions & ASSERT_CHAR_CLASS) \ + && !(tnfa->cflags & REG_ICASE) \ + && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \ + || ((trans_i->assertions & ASSERT_CHAR_CLASS) \ + && (tnfa->cflags & REG_ICASE) \ + && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \ + && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ + || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \ + && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\ + tnfa->cflags & REG_ICASE))) + + + /* Returns 1 if `t1' wins `t2', 0 otherwise. */ static int @@ -86,7 +123,6 @@ tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, static int tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) { - DPRINT(("neg_char_classes_test: %p, %d, %d\n", classes, wc, icase)); while (*classes != (tre_ctype_t)0) if ((!icase && tre_isctype(wc, *classes)) || (icase && (tre_isctype(tre_toupper(wc), *classes) @@ -118,7 +154,6 @@ tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) See `tre-match-backtrack.c'. */ - typedef struct { tre_tnfa_transition_t *state; int *tags; @@ -130,41 +165,16 @@ typedef struct { } tre_reach_pos_t; -#ifdef TRE_DEBUG -static void -tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) -{ - int i; - - while (reach->state != NULL) - { - DPRINT((" %p", (void *)reach->state)); - if (num_tags > 0) - { - DPRINT(("/")); - for (i = 0; i < num_tags; i++) - { - DPRINT(("%d:%d", i, reach->tags[i])); - if (i < (num_tags-1)) - DPRINT((",")); - } - } - reach++; - } - DPRINT(("\n")); - -} -#endif /* TRE_DEBUG */ - static reg_errcode_t -tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, - int *match_tags, int eflags, int *match_end_ofs) +tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, + int *match_tags, int eflags, + int *match_end_ofs) { /* State variables required by GET_NEXT_WCHAR. */ tre_char_t prev_c = 0, next_c = 0; const char *str_byte = string; int pos = -1; - int pos_add_next = 1; + unsigned int pos_add_next = 1; #ifdef TRE_MBSTATE mbstate_t mbstate; #endif /* TRE_MBSTATE */ @@ -210,14 +220,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; /* Allocate the memory. */ -#ifdef TRE_USE_ALLOCA - buf = alloca(total_bytes); -#else /* !TRE_USE_ALLOCA */ - buf = xmalloc(total_bytes); -#endif /* !TRE_USE_ALLOCA */ + buf = xmalloc((unsigned)total_bytes); if (buf == NULL) return REG_ESPACE; - memset(buf, 0, total_bytes); + memset(buf, 0, (size_t)total_bytes); /* Get the various pointers within tmp_buf (properly aligned). */ tmp_tags = (void *)buf; @@ -247,17 +253,12 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, GET_NEXT_WCHAR(); pos = 0; - DPRINT(("length: %d\n", len)); - DPRINT(("pos:chr/code | states and tags\n")); - DPRINT(("-------------+------------------------------------------------\n")); - reach_next_i = reach_next; while (1) { /* If no match found yet, add the initial states to `reach_next'. */ if (match_eo < 0) { - DPRINT((" init >")); trans_i = tnfa->initial; while (trans_i->state != NULL) { @@ -266,12 +267,10 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) { - DPRINT(("assertion failed\n")); trans_i++; continue; } - DPRINT((" %p", (void *)trans_i->state)); reach_next_i->state = trans_i->state; for (i = 0; i < num_tags; i++) reach_next_i->tags[i] = -1; @@ -285,7 +284,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, } if (reach_next_i->state == tnfa->final) { - DPRINT((" found empty match\n")); match_eo = pos; new_match = 1; for (i = 0; i < num_tags; i++) @@ -297,7 +295,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, } trans_i++; } - DPRINT(("\n")); reach_next_i->state = NULL; } else @@ -312,18 +309,53 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, GET_NEXT_WCHAR(); -#ifdef TRE_DEBUG - DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); - tre_print_reach(tnfa, reach_next, num_tags); - DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); - tre_print_reach(tnfa, reach_next, num_tags); -#endif /* TRE_DEBUG */ - /* Swap `reach' and `reach_next'. */ reach_i = reach; reach = reach_next; reach_next = reach_i; + /* For each state in `reach', weed out states that don't fulfill the + minimal matching conditions. */ + if (tnfa->num_minimals && new_match) + { + new_match = 0; + reach_next_i = reach_next; + for (reach_i = reach; reach_i->state; reach_i++) + { + int skip = 0; + for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) + { + int end = tnfa->minimal_tags[i]; + int start = tnfa->minimal_tags[i + 1]; + if (end >= num_tags) + { + skip = 1; + break; + } + else if (reach_i->tags[start] == match_tags[start] + && reach_i->tags[end] < match_tags[end]) + { + skip = 1; + break; + } + } + if (!skip) + { + reach_next_i->state = reach_i->state; + tmp_iptr = reach_next_i->tags; + reach_next_i->tags = reach_i->tags; + reach_i->tags = tmp_iptr; + reach_next_i++; + } + } + reach_next_i->state = NULL; + + /* Swap `reach' and `reach_next'. */ + reach_i = reach; + reach = reach_next; + reach_next = reach_i; + } + /* For each state in `reach' see if there is a transition leaving with the current input symbol to a state not yet in `reach_next', and add the destination states to `reach_next'. */ @@ -333,28 +365,13 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, for (trans_i = reach_i->state; trans_i->state; trans_i++) { /* Does this transition match the input symbol? */ - if (trans_i->code_min <= prev_c && - trans_i->code_max >= prev_c) + if (trans_i->code_min <= (tre_cint_t)prev_c && + trans_i->code_max >= (tre_cint_t)prev_c) { if (trans_i->assertions && (CHECK_ASSERTIONS(trans_i->assertions) - /* Handle character class transitions. */ - || ((trans_i->assertions & ASSERT_CHAR_CLASS) - && !(tnfa->cflags & REG_ICASE) - && !tre_isctype((tre_cint_t)prev_c, - trans_i->u.class)) - || ((trans_i->assertions & ASSERT_CHAR_CLASS) - && (tnfa->cflags & REG_ICASE) - && (!tre_isctype(tre_tolower((tre_cint_t)prev_c), - trans_i->u.class) - && !tre_isctype(tre_toupper((tre_cint_t)prev_c), - trans_i->u.class))) - || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) - && tre_neg_char_classes_match(trans_i->neg_classes, - (tre_cint_t)prev_c, - tnfa->cflags & REG_ICASE)))) + || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) { - DPRINT(("assertion failed\n")); continue; } @@ -385,7 +402,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, || (num_tags > 0 && reach_next_i->tags[0] <= match_tags[0]))) { - DPRINT((" found match %p\n", trans_i->state)); match_eo = pos; new_match = 1; for (i = 0; i < num_tags; i++) @@ -409,7 +425,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, *reach_pos[trans_i->state_id].tags = tmp_tags; if (trans_i->state == tnfa->final) { - DPRINT((" found better match\n")); match_eo = pos; new_match = 1; for (i = 0; i < num_tags; i++) @@ -424,18 +439,15 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, reach_next_i->state = NULL; } - DPRINT(("match end offset = %d\n", match_eo)); - -#ifndef TRE_USE_ALLOCA if (buf) xfree(buf); -#endif /* !TRE_USE_ALLOCA */ *match_end_ofs = match_eo; return match_eo >= 0 ? REG_OK : REG_NOMATCH; } + /*********************************************************************** from tre-match-backtrack.c ***********************************************************************/ @@ -489,16 +501,9 @@ typedef struct tre_backtrack_struct { #define BT_STACK_MBSTATE_OUT #endif /* !TRE_MBSTATE */ - -#ifdef TRE_USE_ALLOCA -#define tre_bt_mem_new tre_mem_newa -#define tre_bt_mem_alloc tre_mem_alloca -#define tre_bt_mem_destroy(obj) do { } while (0) -#else /* !TRE_USE_ALLOCA */ #define tre_bt_mem_new tre_mem_new #define tre_bt_mem_alloc tre_mem_alloc #define tre_bt_mem_destroy tre_mem_destroy -#endif /* !TRE_USE_ALLOCA */ #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ @@ -572,14 +577,13 @@ typedef struct tre_backtrack_struct { static reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, - int len, int *match_tags, - int eflags, int *match_end_ofs) + int *match_tags, int eflags, int *match_end_ofs) { /* State variables required by GET_NEXT_WCHAR. */ tre_char_t prev_c = 0, next_c = 0; const char *str_byte = string; int pos = 0; - int pos_add_next = 1; + unsigned int pos_add_next = 1; #ifdef TRE_MBSTATE mbstate_t mbstate; #endif /* TRE_MBSTATE */ @@ -597,9 +601,6 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, mbstate_t mbstate_start; #endif /* TRE_MBSTATE */ - /* Compilation flags for this regexp. */ - int cflags = tnfa->cflags; - /* End offset of best match so far, or -1 if no match found yet. */ int match_eo = -1; /* Tag arrays. */ @@ -633,30 +634,33 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, stack->prev = NULL; stack->next = NULL; -#ifdef TRE_USE_ALLOCA - tags = alloca(sizeof(*tags) * tnfa->num_tags); - pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches); - states_seen = alloca(sizeof(*states_seen) * tnfa->num_states); -#else /* !TRE_USE_ALLOCA */ - tags = xmalloc(sizeof(*tags) * tnfa->num_tags); - if (!tags) + if (tnfa->num_tags) { - ret = REG_ESPACE; - goto error_exit; + tags = xmalloc(sizeof(*tags) * tnfa->num_tags); + if (!tags) + { + ret = REG_ESPACE; + goto error_exit; + } } - pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); - if (!pmatch) + if (tnfa->num_submatches) { - ret = REG_ESPACE; - goto error_exit; + pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); + if (!pmatch) + { + ret = REG_ESPACE; + goto error_exit; + } } - states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); - if (!states_seen) + if (tnfa->num_states) { - ret = REG_ESPACE; - goto error_exit; + states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); + if (!states_seen) + { + ret = REG_ESPACE; + goto error_exit; + } } -#endif /* !TRE_USE_ALLOCA */ retry: { @@ -685,10 +689,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, next_tags = NULL; for (trans_i = tnfa->initial; trans_i->state; trans_i++) { - DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c)); if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) { - DPRINT(("assert failed\n")); continue; } if (state == NULL) @@ -700,8 +702,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, else { /* Backtrack to this state. */ - DPRINT(("saving state %d for backtracking\n", trans_i->state_id)); - BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, + BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, trans_i->state_id, next_c, tags, mbstate); { int *tmp = trans_i->tags; @@ -717,22 +718,16 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, tags[*next_tags] = pos; - DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte)); - DPRINT(("pos:chr/code | state and tags\n")); - DPRINT(("-------------+------------------------------------------------\n")); - if (state == NULL) goto backtrack; while (1) { - tre_tnfa_transition_t *trans_i, *next_state; + tre_tnfa_transition_t *next_state; int empty_br_match; - DPRINT(("start loop\n")); if (state == tnfa->final) { - DPRINT((" match found, %d %d\n", match_eo, pos)); if (match_eo < pos || (match_eo == pos && match_tags @@ -741,7 +736,6 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, { int i; /* This match wins the previous match. */ - DPRINT((" win previous\n")); match_eo = pos; if (match_tags) for (i = 0; i < tnfa->num_tags; i++) @@ -752,17 +746,6 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, goto backtrack; } -#ifdef TRE_DEBUG - DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c, - state)); - { - int i; - for (i = 0; i < tnfa->num_tags; i++) - DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : "")); - DPRINT(("\n")); - } -#endif /* TRE_DEBUG */ - /* Go to the next character in the input string. */ empty_br_match = 0; trans_i = state; @@ -775,26 +758,17 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, int bt_len; int result; - DPRINT((" should match back reference %d\n", bt)); /* Get the substring we need to match against. Remember to turn off REG_NOSUB temporarily. */ - tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & !REG_NOSUB, + tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, tnfa, tags, pos); so = pmatch[bt].rm_so; eo = pmatch[bt].rm_eo; bt_len = eo - so; - if (len < 0) - { - result = strncmp((char*)string + so, str_byte - 1, bt_len); - } - else if (len - pos < bt_len) - result = 1; - else - result = memcmp((char*)string + so, str_byte - 1, bt_len); + result = strncmp((const char*)string + so, str_byte - 1, + (size_t)bt_len); - /* We can ignore multibyte characters here because the backref - string is already aligned at character boundaries. */ if (result == 0) { /* Back reference matched. Check for infinite loop. */ @@ -802,7 +776,6 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, empty_br_match = 1; if (empty_br_match && states_seen[trans_i->state_id]) { - DPRINT((" avoid loop\n")); goto backtrack; } @@ -810,31 +783,20 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, /* Advance in input string and resync `prev_c', `next_c' and pos. */ - DPRINT((" back reference matched\n")); str_byte += bt_len - 1; pos += bt_len - 1; GET_NEXT_WCHAR(); - DPRINT((" pos now %d\n", pos)); } else { - DPRINT((" back reference did not match\n")); goto backtrack; } } else { /* Check for end of string. */ - if (len < 0) - { - if (next_c == L'\0') - goto backtrack; - } - else - { - if (pos >= len) + if (next_c == L'\0') goto backtrack; - } /* Read the next character. */ GET_NEXT_WCHAR(); @@ -843,49 +805,29 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, next_state = NULL; for (trans_i = state; trans_i->state; trans_i++) { - DPRINT((" transition %d-%d (%c-%c) %d to %d\n", - trans_i->code_min, trans_i->code_max, - trans_i->code_min, trans_i->code_max, - trans_i->assertions, trans_i->state_id)); - if (trans_i->code_min <= prev_c && trans_i->code_max >= prev_c) + if (trans_i->code_min <= (tre_cint_t)prev_c + && trans_i->code_max >= (tre_cint_t)prev_c) { if (trans_i->assertions && (CHECK_ASSERTIONS(trans_i->assertions) - /* Handle character class transitions. */ - || ((trans_i->assertions & ASSERT_CHAR_CLASS) - && !(cflags & REG_ICASE) - && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) - || ((trans_i->assertions & ASSERT_CHAR_CLASS) - && (cflags & REG_ICASE) - && (!tre_isctype(tre_tolower((tre_cint_t)prev_c), - trans_i->u.class) - && !tre_isctype(tre_toupper((tre_cint_t)prev_c), - trans_i->u.class))) - || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) - && tre_neg_char_classes_match(trans_i->neg_classes, - (tre_cint_t)prev_c, - cflags & REG_ICASE)))) + || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) { - DPRINT((" assertion failed\n")); continue; } if (next_state == NULL) { /* First matching transition. */ - DPRINT((" Next state is %d\n", trans_i->state_id)); next_state = trans_i->state; next_tags = trans_i->tags; } else { - /* Second mathing transition. We may need to backtrack here + /* Second matching transition. We may need to backtrack here to take this transition instead of the first one, so we push this transition in the backtracking stack so we can jump back here if needed. */ - DPRINT((" saving state %d for backtracking\n", - trans_i->state_id)); - BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, + BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, trans_i->state_id, next_c, tags, mbstate); { int *tmp; @@ -916,11 +858,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, /* A matching transition was not found. Try to backtrack. */ if (stack->prev) { - DPRINT((" backtracking\n")); if (stack->item.state->assertions & ASSERT_BACKREF) { - DPRINT((" states_seen[%d] = 0\n", - stack->item.state_id)); states_seen[stack->item.state_id] = 0; } @@ -930,23 +869,10 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, { /* Try starting from a later position in the input string. */ /* Check for end of string. */ - if (len < 0) - { - if (next_c == L'\0') - { - DPRINT(("end of string.\n")); - break; - } - } - else - { - if (pos >= len) + if (next_c == L'\0') { - DPRINT(("end of string.\n")); break; } - } - DPRINT(("restarting from next start position\n")); next_c = next_c_start; #ifdef TRE_MBSTATE mbstate = mbstate_start; @@ -956,7 +882,6 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, } else { - DPRINT(("finished\n")); break; } } @@ -979,7 +904,6 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, return ret; } - /*********************************************************************** from regexec.c ***********************************************************************/ @@ -998,7 +922,6 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, if (match_eo >= 0 && !(cflags & REG_NOSUB)) { /* Construct submatch offsets from the tags. */ - DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); submatch_data = tnfa->submatch_data; while (i < tnfa->num_submatches && i < nmatch) { @@ -1017,9 +940,6 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) pmatch[i].rm_so = pmatch[i].rm_eo = -1; - DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, - submatch_data[i].so_tag, pmatch[i].rm_so, - submatch_data[i].eo_tag, pmatch[i].rm_eo)); i++; } /* Reset all submatches that are not within all of their parent @@ -1035,7 +955,6 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, if (parents != NULL) for (j = 0; parents[j] >= 0; j++) { - DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); if (pmatch[i].rm_so < pmatch[parents[j]].rm_so || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) pmatch[i].rm_so = pmatch[i].rm_eo = -1; @@ -1057,19 +976,16 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, Wrapper functions for POSIX compatible regexp matching. */ -static int -tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, +int +regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags) { + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; reg_errcode_t status; int *tags = NULL, eo; if (tnfa->num_tags > 0 && nmatch > 0) { -#ifdef TRE_USE_ALLOCA - tags = alloca(sizeof(*tags) * tnfa->num_tags); -#else /* !TRE_USE_ALLOCA */ tags = xmalloc(sizeof(*tags) * tnfa->num_tags); -#endif /* !TRE_USE_ALLOCA */ if (tags == NULL) return REG_ESPACE; } @@ -1078,30 +994,18 @@ tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, if (tnfa->have_backrefs) { /* The regex has back references, use the backtracking matcher. */ - status = tre_tnfa_run_backtrack(tnfa, string, len, tags, eflags, &eo); + status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo); } else { /* Exact matching, no back references, use the parallel matcher. */ - status = tre_tnfa_run_parallel(tnfa, string, len, tags, eflags, &eo); + status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo); } if (status == REG_OK) /* A match was found, so fill the submatch registers. */ tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); -#ifndef TRE_USE_ALLOCA if (tags) xfree(tags); -#endif /* !TRE_USE_ALLOCA */ return status; } - -int -regexec(const regex_t *preg, const char *str, - size_t nmatch, regmatch_t pmatch[], int eflags) -{ - return tre_match((void *)preg->TRE_REGEX_T_FIELD, str, -1, - nmatch, pmatch, eflags); -} - -/* EOF */ diff --git a/src/regex/tre-mem.c b/src/regex/tre-mem.c index d7bdd3db..86f809d4 100644 --- a/src/regex/tre-mem.c +++ b/src/regex/tre-mem.c @@ -1,21 +1,31 @@ /* tre-mem.c - TRE memory allocator - Copyright (c) 2001-2006 Ville Laurikari - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (c) 2001-2009 Ville Laurikari + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -31,6 +41,12 @@ #include "tre.h" +/* + This memory allocator is for allocating small memory blocks efficiently + in terms of memory overhead and execution speed. The allocated blocks + cannot be freed individually, only all at once. There can be multiple + allocators, though. +*/ /* Returns a new memory allocator or NULL if out of memory. */ tre_mem_t @@ -77,24 +93,9 @@ tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, if (mem->failed) { - DPRINT(("tre_mem_alloc: oops, called after failure?!\n")); return NULL; } -#ifdef MALLOC_DEBUGGING - if (!provided) - { - ptr = xmalloc(1); - if (ptr == NULL) - { - DPRINT(("tre_mem_alloc: xmalloc forced failure\n")); - mem->failed = 1; - return NULL; - } - xfree(ptr); - } -#endif /* MALLOC_DEBUGGING */ - if (mem->n < size) { /* We need more memory than is available in the current block. @@ -102,10 +103,8 @@ tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, tre_list_t *l; if (provided) { - DPRINT(("tre_mem_alloc: using provided block\n")); if (provided_block == NULL) { - DPRINT(("tre_mem_alloc: provided block was NULL\n")); mem->failed = 1; return NULL; } @@ -119,8 +118,6 @@ tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, block_size = size * 8; else block_size = TRE_MEM_BLOCK_SIZE; - DPRINT(("tre_mem_alloc: allocating new %d byte block\n", - block_size)); l = xmalloc(sizeof(*l)); if (l == NULL) { @@ -159,5 +156,3 @@ tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, return ptr; } - -/* EOF */ diff --git a/src/regex/tre.h b/src/regex/tre.h index bfd171f4..d6e1c2a7 100644 --- a/src/regex/tre.h +++ b/src/regex/tre.h @@ -1,21 +1,31 @@ /* tre-internal.h - TRE internal definitions - Copyright (c) 2001-2006 Ville Laurikari . - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (c) 2001-2009 Ville Laurikari + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -23,12 +33,7 @@ #include #include -#define TRE_MULTIBYTE 1 #undef TRE_MBSTATE -#define TRE_WCHAR 1 -#define TRE_USE_SYSTEM_WCTYPE 1 -#define HAVE_WCSTOMBS 1 -#define TRE_MB_CUR_MAX MB_CUR_MAX #define NDEBUG @@ -37,33 +42,16 @@ typedef int reg_errcode_t; typedef wchar_t tre_char_t; - -#ifdef TRE_DEBUG -#include -#define DPRINT(msg) do {printf msg; fflush(stdout);} while(0) -#else /* !TRE_DEBUG */ #define DPRINT(msg) do { } while(0) -#endif /* !TRE_DEBUG */ #define elementsof(x) ( sizeof(x) / sizeof(x[0]) ) -#if 1 -int __mbtowc(wchar_t *, const char *); -#define tre_mbrtowc(pwc, s, n, ps) (__mbtowc((pwc), (s))) -#else #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n))) -#endif /* Wide characters. */ typedef wint_t tre_cint_t; #define TRE_CHAR_MAX WCHAR_MAX -#ifdef TRE_MULTIBYTE -#define TRE_MB_CUR_MAX MB_CUR_MAX -#else /* !TRE_MULTIBYTE */ -#define TRE_MB_CUR_MAX 1 -#endif /* !TRE_MULTIBYTE */ - #define tre_isalnum iswalnum #define tre_isalpha iswalpha #define tre_isblank iswblank @@ -98,9 +86,6 @@ typedef wctype_t tre_ctype_t; #define MAX(a, b) (((a) >= (b)) ? (a) : (b)) #define MIN(a, b) (((a) <= (b)) ? (a) : (b)) -/* Define STRF to the correct printf formatter for strings. */ -#define STRF "ls" - /* TNFA transition type. A TNFA state is an array of transitions, the terminator is a transition with NULL `state'. */ typedef struct tnfa_transition tre_tnfa_transition_t; @@ -170,42 +155,21 @@ struct tnfa { tre_tnfa_transition_t *initial; tre_tnfa_transition_t *final; tre_submatch_data_t *submatch_data; + char *firstpos_chars; + int first_char; unsigned int num_submatches; tre_tag_direction_t *tag_directions; + int *minimal_tags; int num_tags; + int num_minimals; int end_tag; int num_states; int cflags; int have_backrefs; + int have_approx; + int params_depth; }; -#if 0 -static int -tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags); - -static void -tre_free(regex_t *preg); - -static void -tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, int *tags, int match_eo); - -static reg_errcode_t -tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, - tre_str_type_t type, int *match_tags, int eflags, - int *match_end_ofs); - -static reg_errcode_t -tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, - tre_str_type_t type, int *match_tags, int eflags, - int *match_end_ofs); - -static reg_errcode_t -tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, - int len, tre_str_type_t type, int *match_tags, - int eflags, int *match_end_ofs); -#endif - /* from tre-mem.h: */ #define TRE_MEM_BLOCK_SIZE 1024 @@ -266,4 +230,3 @@ void tre_mem_destroy(tre_mem_t mem); #define xfree free #define xrealloc realloc -/* EOF */ -- 2.25.1