/*
regcomp.c - TRE POSIX compatible regex compilation functions.
- Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
+/***********************************************************************
+ from tre-compile.h
+***********************************************************************/
+
+typedef struct {
+ int position;
+ int code_min;
+ int code_max;
+ int *tags;
+ int assertions;
+ tre_ctype_t class;
+ tre_ctype_t *neg_classes;
+ int backref;
+ int *params;
+} tre_pos_and_tags_t;
+
+
/***********************************************************************
from tre-ast.c and tre-ast.h
***********************************************************************/
#define IS_TAG(x) ((x)->code_min == TAG)
#define IS_BACKREF(x) ((x)->code_min == BACKREF)
-/* Taken from tre-compile.h */
-typedef struct {
- int position;
- int code_min;
- int code_max;
- int *tags;
- int assertions;
- tre_ctype_t class;
- tre_ctype_t *neg_classes;
- int backref;
-} tre_pos_and_tags_t;
/* A generic AST node. All AST nodes consist of this node on the top
level with `obj' pointing to the actual content. */
long code_min;
long code_max;
int position;
- tre_ctype_t class;
+ union {
+ tre_ctype_t class;
+ int *params;
+ } u;
tre_ctype_t *neg_classes;
} tre_literal_t;
int min;
/* Maximum number of consecutive matches. */
int max;
+ /* If 0, match as many characters as possible, if 1 match as few as
+ possible. Note that this does not always mean the same thing as
+ matching as many/few repetitions as possible. */
+ unsigned int minimal:1;
} tre_iteration_t;
/* An "union" node. These are created for the "|" operator. */
tre_ast_node_t *right;
} tre_union_t;
+static tre_ast_node_t *
+tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
+
+static tre_ast_node_t *
+tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
+
+static tre_ast_node_t *
+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
+ int minimal);
+
+static tre_ast_node_t *
+tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
+
+static tre_ast_node_t *
+tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
+ tre_ast_node_t *right);
+
+
static tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
{
}
static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max)
+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
+ int minimal)
{
tre_ast_node_t *node;
tre_iteration_t *iter;
iter->arg = arg;
iter->min = min;
iter->max = max;
+ iter->minimal = minimal;
node->num_submatches = arg->num_submatches;
return node;
return node;
}
+
/***********************************************************************
from tre-stack.c and tre-stack.h
***********************************************************************/
+typedef struct tre_stack_rec tre_stack_t;
+
+/* Creates a new stack object. `size' is initial size in bytes, `max_size'
+ is maximum size, and `increment' specifies how much more space will be
+ allocated with realloc() if all space gets used up. Returns the stack
+ object or NULL if out of memory. */
+static tre_stack_t *
+tre_stack_new(int size, int max_size, int increment);
+
+/* Frees the stack object. */
+static void
+tre_stack_destroy(tre_stack_t *s);
+
+/* Returns the current number of objects in the stack. */
+static int
+tre_stack_num_objects(tre_stack_t *s);
+
+/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
+ `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
+ This tries to realloc() more space before failing if maximum size
+ has not yet been reached. Returns REG_OK if successful. */
+#define declare_pushf(typetag, type) \
+ static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
+
+declare_pushf(voidptr, void *);
+declare_pushf(int, int);
+
+/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
+ element off of stack `s' and returns it. The stack must not be
+ empty. */
+#define declare_popf(typetag, type) \
+ static type tre_stack_pop_ ## typetag(tre_stack_t *s)
+
+declare_popf(voidptr, void *);
+declare_popf(int, int);
+
/* Just to save some typing. */
-#define STACK_PUSH(s, value) \
+#define STACK_PUSH(s, typetag, value) \
do \
{ \
- status = tre_stack_push(s, (void *)(value)); \
+ status = tre_stack_push_ ## typetag(s, value); \
} \
- while (0)
+ while (/*CONSTCOND*/0)
-#define STACK_PUSHX(s, value) \
+#define STACK_PUSHX(s, typetag, value) \
{ \
- status = tre_stack_push(s, (void *)(value)); \
+ status = tre_stack_push_ ## typetag(s, value); \
if (status != REG_OK) \
break; \
}
-#define STACK_PUSHR(s, value) \
+#define STACK_PUSHR(s, typetag, value) \
{ \
- reg_errcode_t status; \
- status = tre_stack_push(s, (void *)(value)); \
- if (status != REG_OK) \
- return status; \
+ reg_errcode_t _status; \
+ _status = tre_stack_push_ ## typetag(s, value); \
+ if (_status != REG_OK) \
+ return _status; \
}
-typedef struct tre_stack_rec {
+union tre_stack_item {
+ void *voidptr_value;
+ int int_value;
+};
+
+struct tre_stack_rec {
int size;
int max_size;
int increment;
int ptr;
- void **stack;
-} tre_stack_t;
+ union tre_stack_item *stack;
+};
static tre_stack_t *
}
static reg_errcode_t
-tre_stack_push(tre_stack_t *s, void *value)
+tre_stack_push(tre_stack_t *s, union tre_stack_item value)
{
if (s->ptr < s->size)
{
{
if (s->size >= s->max_size)
{
- DPRINT(("tre_stack_push: stack full\n"));
return REG_ESPACE;
}
else
{
- void **new_buffer;
+ union tre_stack_item *new_buffer;
int new_size;
- DPRINT(("tre_stack_push: trying to realloc more space\n"));
new_size = s->size + s->increment;
if (new_size > s->max_size)
new_size = s->max_size;
new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
if (new_buffer == NULL)
{
- DPRINT(("tre_stack_push: realloc failed.\n"));
return REG_ESPACE;
}
- DPRINT(("tre_stack_push: realloc succeeded.\n"));
assert(new_size > s->size);
s->size = new_size;
s->stack = new_buffer;
return REG_OK;
}
-static void *
-tre_stack_pop(tre_stack_t *s)
-{
- return s->stack[--s->ptr];
+#define define_pushf(typetag, type) \
+ declare_pushf(typetag, type) { \
+ union tre_stack_item item; \
+ item.typetag ## _value = value; \
+ return tre_stack_push(s, item); \
}
+define_pushf(int, int)
+define_pushf(voidptr, void *)
+
+#define define_popf(typetag, type) \
+ declare_popf(typetag, type) { \
+ return s->stack[--s->ptr].typetag ## _value; \
+ }
+
+define_popf(int, int)
+define_popf(voidptr, void *)
+
/***********************************************************************
from tre-parse.c and tre-parse.h
/* The parse result. */
tre_ast_node_t *result;
/* The regexp to parse and its length. */
- const tre_char_t *re;
+ const char *re;
/* The first character of the entire regexp. */
- const tre_char_t *re_start;
- /* The first character after the end of the regexp. */
- const tre_char_t *re_end;
- int len;
+ const char *re_start;
/* Current submatch ID. */
int submatch_id;
/* Current position (number of literal). */
int position;
/* The highest back reference or -1 if none seen so far. */
int max_backref;
+ /* This flag is set if the regexp uses approximate matching. */
+ int have_approx;
/* Compilation flags. */
int cflags;
/* If this flag is set the top-level submatch is not captured. */
int nofirstsub;
} tre_parse_ctx_t;
+/* Parses a wide character regexp pattern into a syntax tree. This parser
+ handles both syntaxes (BRE and ERE), including the TRE extensions. */
+static reg_errcode_t
+tre_parse(tre_parse_ctx_t *ctx);
+
+
+/*
+ This parser is just a simple recursive descent parser for POSIX.2
+ regexps. The parser supports both the obsolete default syntax and
+ the "extended" syntax, and some nonstandard extensions.
+*/
+
+/* Characters with special meanings in regexp syntax. */
+#define CHAR_PIPE '|'
+#define CHAR_LPAREN '('
+#define CHAR_RPAREN ')'
+#define CHAR_LBRACE '{'
+#define CHAR_RBRACE '}'
+#define CHAR_LBRACKET '['
+#define CHAR_RBRACKET ']'
+#define CHAR_MINUS '-'
+#define CHAR_STAR '*'
+#define CHAR_QUESTIONMARK '?'
+#define CHAR_PLUS '+'
+#define CHAR_PERIOD '.'
+#define CHAR_COLON ':'
+#define CHAR_EQUAL '='
+#define CHAR_COMMA ','
+#define CHAR_CARET '^'
+#define CHAR_DOLLAR '$'
+#define CHAR_BACKSLASH '\\'
+#define CHAR_HASH '#'
+#define CHAR_TILDE '~'
+
+
+/* Some macros for expanding \w, \s, etc. */
+static const struct tre_macro_struct {
+ const char c;
+ const char *expansion;
+} tre_macros[] =
+ { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
+ {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
+ {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
+ {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
+ { 0, NULL }
+ };
+
+
+/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
+ must have at least `len' items. Sets buf[0] to zero if the there
+ is no match in `tre_macros'. */
+static const char *
+tre_expand_macro(const char *regex)
+{
+ int i;
+
+ if (!*regex)
+ return 0;
+
+ for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
+ return tre_macros[i].expansion;
+}
+
static reg_errcode_t
tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
tre_ast_node_t ***items)
if (*i >= *max_i)
{
tre_ast_node_t **new_items;
- DPRINT(("out of array space, i = %d\n", *i));
/* If the array is already 1024 items large, give up -- there's
probably an error in the regexp (e.g. not a '\0' terminated
string and missing ']') */
}
-/* Expands a character class to character ranges. */
-static reg_errcode_t
-tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
- int *i, int *max_i, int cflags)
-{
- reg_errcode_t status = REG_OK;
- tre_cint_t c;
- int j, min = -1, max = 0;
- assert(TRE_MB_CUR_MAX == 1);
-
- DPRINT((" expanding class to character ranges\n"));
- for (j = 0; (j < 256) && (status == REG_OK); j++)
- {
- c = j;
- if (tre_isctype(c, class)
- || ((cflags & REG_ICASE)
- && (tre_isctype(tre_tolower(c), class)
- || tre_isctype(tre_toupper(c), class))))
-{
- if (min < 0)
- min = c;
- max = c;
- }
- else if (min >= 0)
- {
- DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max));
- status = tre_new_item(mem, min, max, i, max_i, items);
- min = -1;
- }
- }
- if (min >= 0 && status == REG_OK)
- status = tre_new_item(mem, min, max, i, max_i, items);
- return status;
-}
-
-
static int
tre_compare_items(const void *a, const void *b)
{
- tre_ast_node_t *node_a = *(tre_ast_node_t **)a;
- tre_ast_node_t *node_b = *(tre_ast_node_t **)b;
+ const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
+ const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
int a_min = l_a->code_min, b_min = l_b->code_min;
tre_ast_node_t ***items, int *num_items,
int *items_size)
{
- const tre_char_t *re = ctx->re;
+ const char *re = ctx->re;
reg_errcode_t status = REG_OK;
tre_ctype_t class = (tre_ctype_t)0;
int i = *num_items;
while (status == REG_OK)
{
skip = 0;
- if (re == ctx->re_end)
+ if (!*re)
{
status = REG_EBRACK;
}
- else if (*re == ']' && re > ctx->re)
+ else if (*re == CHAR_RBRACKET && re > ctx->re)
{
- DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
- ctx->re_end - re, re));
re++;
break;
}
else
{
tre_cint_t min = 0, max = 0;
+ wchar_t wc;
+ int clen = mbtowc(&wc, re, -1);
+
+ if (clen<0) clen=1, wc=WEOF;
class = (tre_ctype_t)0;
- if (re + 2 < ctx->re_end
- && *(re + 1) == '-' && *(re + 2) != ']')
+ if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
{
- DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n",
- ctx->re_end - re, re));
- min = *re;
- max = *(re + 2);
- re += 3;
+ min = wc;
+ re += clen+1;
+ clen = mbtowc(&wc, re, -1);
+ if (clen<0) clen=1, wc=WEOF;
+ max = wc;
+ re += clen;
/* XXX - Should use collation order instead of encoding values
in character ranges. */
if (min > max)
status = REG_ERANGE;
}
- else if (re + 1 < ctx->re_end
- && *re == '[' && *(re + 1) == '.')
+ else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
status = REG_ECOLLATE;
- else if (re + 1 < ctx->re_end
- && *re == '[' && *(re + 1) == '=')
+ else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
status = REG_ECOLLATE;
- else if (re + 1 < ctx->re_end
- && *re == '[' && *(re + 1) == ':')
+ else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
{
char tmp_str[64];
- const tre_char_t *endptr = re + 2;
+ const char *endptr = re + 2;
int len;
- DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n",
- ctx->re_end - re, re));
- while (endptr < ctx->re_end && *endptr != ':')
+ while (*endptr && *endptr != CHAR_COLON)
endptr++;
- if (endptr != ctx->re_end)
+ if (*endptr)
{
len = MIN(endptr - re - 2, 63);
-#ifdef TRE_WCHAR
- {
- tre_char_t tmp_wcs[64];
- wcsncpy(tmp_wcs, re + 2, len);
- tmp_wcs[len] = '\0';
-#if defined HAVE_WCSRTOMBS
- {
- mbstate_t state;
- const tre_char_t *src = tmp_wcs;
- memset(&state, '\0', sizeof(state));
- len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
- }
-#elif defined HAVE_WCSTOMBS
- len = wcstombs(tmp_str, tmp_wcs, 63);
-#endif /* defined HAVE_WCSTOMBS */
- }
-#else /* !TRE_WCHAR */
strncpy(tmp_str, re + 2, len);
-#endif /* !TRE_WCHAR */
tmp_str[len] = '\0';
- DPRINT((" class name: %s\n", tmp_str));
class = tre_ctype(tmp_str);
if (!class)
status = REG_ECTYPE;
- /* Optimize character classes for 8 bit character sets. */
- if (status == REG_OK && TRE_MB_CUR_MAX == 1)
- {
- status = tre_expand_ctype(ctx->mem, class, items,
- &i, &max_i, ctx->cflags);
- class = (tre_ctype_t)0;
- skip = 1;
- }
re = endptr + 2;
}
else
}
else
{
- DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n",
- ctx->re_end - re, re));
- if (*re == '-' && *(re + 1) != ']'
+ if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
&& ctx->re != re)
/* Two ranges are not allowed to share and endpoint. */
status = REG_ERANGE;
- min = max = *re++;
+ min = max = wc;
+ re += clen;
}
if (status != REG_OK)
status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
if (status != REG_OK)
break;
- ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
+ ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
}
/* Add opposite-case counterpoints if REG_ICASE is present.
This is broken if there are more than two "same" characters. */
if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
{
- int cmin, ccurr;
+ tre_cint_t cmin, ccurr;
- DPRINT(("adding opposite-case counterpoints\n"));
while (min <= max)
{
if (tre_islower(min))
if (items == NULL)
return REG_ESPACE;
- if (*ctx->re == '^')
+ if (*ctx->re == CHAR_CARET)
{
- DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
negate = 1;
ctx->re++;
}
/* Sort the array if we need to negate it. */
if (negate)
- qsort(items, i, sizeof(*items), tre_compare_items);
+ qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
curr_max = curr_min = 0;
/* Build a union of the items in the array, negated if necessary. */
min = l->code_min;
max = l->code_max;
- DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
- (int)l->code_min, (int)l->code_max, (long)l->class, curr_max));
-
if (negate)
{
if (min < curr_max)
{
/* Overlap. */
curr_max = MAX(max + 1, curr_max);
- DPRINT(("overlap, curr_max = %d\n", curr_max));
l = NULL;
}
else
curr_max = min - 1;
if (curr_max >= curr_min)
{
- DPRINT(("no overlap\n"));
l->code_min = curr_min;
l->code_max = curr_max;
}
else
{
- DPRINT(("no overlap, zero room\n"));
l = NULL;
}
curr_min = curr_max = max + 1;
if (l != NULL)
{
int k;
- DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
l->position = ctx->position;
if (num_neg_classes > 0)
{
if (negate)
{
int k;
- DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
if (n == NULL)
status = REG_ESPACE;
/* Parses a positive decimal integer. Returns -1 if the string does not
contain a valid number. */
static int
-tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
+tre_parse_int(const char **regex)
{
int num = -1;
- const tre_char_t *r = *regex;
- while (r < regex_end && *r >= '0' && *r <= '9')
+ const char *r = *regex;
+ while (*r-'0'<10U)
{
if (num < 0)
num = 0;
tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
{
int min, max;
- const tre_char_t *r = ctx->re;
- const tre_char_t *start;
- int counts_set = 0;
+ const char *r = ctx->re;
+ int minimal = 0;
/* Parse number (minimum repetition count). */
min = -1;
- if (r < ctx->re_end && *r >= '0' && *r <= '9') {
- DPRINT(("tre_parse: min count: '%.*" STRF "'\n", ctx->re_end - r, r));
- min = tre_parse_int(&r, ctx->re_end);
+ if (*r >= '0' && *r <= '9') {
+ min = tre_parse_int(&r);
}
/* Parse comma and second number (maximum repetition count). */
max = min;
- if (r < ctx->re_end && *r == ',')
+ if (*r == CHAR_COMMA)
{
r++;
- DPRINT(("tre_parse: max count: '%.*" STRF "'\n", ctx->re_end - r, r));
- max = tre_parse_int(&r, ctx->re_end);
+ max = tre_parse_int(&r);
}
/* Check that the repeat counts are sane. */
if ((max >= 0 && min > max) || max > RE_DUP_MAX)
return REG_BADBR;
-
- /*
- '{'
- optionally followed immediately by a number == minimum repcount
- optionally followed by , then a number == maximum repcount
- */
-
-
- do {
- int done;
- start = r;
-
- /* Parse count limit settings */
- done = 0;
- if (!counts_set)
- while (r + 1 < ctx->re_end && !done)
- {
- switch (*r)
- {
- case ',':
- r++;
- break;
- case ' ':
- r++;
- break;
- case '}':
- done = 1;
- break;
- default:
- done = 1;
- break;
- }
- }
- } while (start != r);
-
/* Missing }. */
- if (r >= ctx->re_end)
+ if (!*r)
return REG_EBRACE;
/* Empty contents of {}. */
/* Parse the ending '}' or '\}'.*/
if (ctx->cflags & REG_EXTENDED)
{
- if (r >= ctx->re_end || *r != '}')
+ if (*r != CHAR_RBRACE)
return REG_BADBR;
r++;
}
else
{
- if (r + 1 >= ctx->re_end
- || *r != '\\'
- || *(r + 1) != '}')
+ if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
return REG_BADBR;
r += 2;
}
-
/* Create the AST node(s). */
if (min == 0 && max == 0)
{
/* Only approximate parameters set, no repetitions. */
min = max = 1;
- *result = tre_ast_new_iter(ctx->mem, *result, min, max);
+ *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
if (!*result)
return REG_ESPACE;
}
int bottom = tre_stack_num_objects(stack);
int depth = 0;
- DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
- ctx->len, ctx->re, ctx->len));
-
if (!ctx->nofirstsub)
{
- STACK_PUSH(stack, ctx->re);
- STACK_PUSH(stack, ctx->submatch_id);
- STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH);
+ STACK_PUSH(stack, int, ctx->submatch_id);
+ STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
ctx->submatch_id++;
}
- STACK_PUSH(stack, PARSE_RE);
+ STACK_PUSH(stack, int, PARSE_RE);
ctx->re_start = ctx->re;
- ctx->re_end = ctx->re + ctx->len;
/* The following is basically just a recursive descent parser. I use
{
if (status != REG_OK)
break;
- symbol = (tre_parse_re_stack_symbol_t)tre_stack_pop(stack);
+ symbol = tre_stack_pop_int(stack);
switch (symbol)
{
case PARSE_RE:
/* Parse a full regexp. A regexp is one or more branches,
separated by the union operator `|'. */
if (ctx->cflags & REG_EXTENDED)
- STACK_PUSHX(stack, PARSE_UNION);
- STACK_PUSHX(stack, PARSE_BRANCH);
+ STACK_PUSHX(stack, int, PARSE_UNION);
+ STACK_PUSHX(stack, int, PARSE_BRANCH);
break;
case PARSE_BRANCH:
/* Parse a branch. A branch is one or more pieces, concatenated.
A piece is an atom possibly followed by a postfix operator. */
- STACK_PUSHX(stack, PARSE_CATENATION);
- STACK_PUSHX(stack, PARSE_PIECE);
+ STACK_PUSHX(stack, int, PARSE_CATENATION);
+ STACK_PUSHX(stack, int, PARSE_PIECE);
break;
case PARSE_PIECE:
/* Parse a piece. A piece is an atom possibly followed by one
or more postfix operators. */
- STACK_PUSHX(stack, PARSE_POSTFIX);
- STACK_PUSHX(stack, PARSE_ATOM);
+ STACK_PUSHX(stack, int, PARSE_POSTFIX);
+ STACK_PUSHX(stack, int, PARSE_ATOM);
break;
case PARSE_CATENATION:
/* If the expression has not ended, parse another piece. */
{
tre_char_t c;
- if (ctx->re >= ctx->re_end)
+ if (!*ctx->re)
break;
c = *ctx->re;
- if (ctx->cflags & REG_EXTENDED && c == '|')
- break;
- if ((ctx->cflags & REG_EXTENDED
- && c == ')' && depth > 0)
- || (!(ctx->cflags & REG_EXTENDED)
- && (c == '\\'
- && *(ctx->re + 1) == ')')))
+ if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
+ break;
+ if ((ctx->cflags & REG_EXTENDED
+ && c == CHAR_RPAREN && depth > 0)
+ || (!(ctx->cflags & REG_EXTENDED)
+ && (c == CHAR_BACKSLASH
+ && *(ctx->re + 1) == CHAR_RPAREN)))
+ {
+ if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
+ status = REG_EPAREN;
+ depth--;
+ if (!(ctx->cflags & REG_EXTENDED))
+ ctx->re += 2;
+ break;
+ }
+
{
- if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
- status = REG_EPAREN;
- DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
- depth--;
- if (!(ctx->cflags & REG_EXTENDED))
- ctx->re += 2;
- break;
+ /* Default case, left associative concatenation. */
+ STACK_PUSHX(stack, int, PARSE_CATENATION);
+ STACK_PUSHX(stack, voidptr, result);
+ STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
+ STACK_PUSHX(stack, int, PARSE_PIECE);
}
-
- /* Left associative concatenation. */
- STACK_PUSHX(stack, PARSE_CATENATION);
- STACK_PUSHX(stack, result);
- STACK_PUSHX(stack, PARSE_POST_CATENATION);
- STACK_PUSHX(stack, PARSE_PIECE);
break;
}
case PARSE_POST_CATENATION:
{
- tre_ast_node_t *tree = tre_stack_pop(stack);
+ tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
tre_ast_node_t *tmp_node;
tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
if (!tmp_node)
}
case PARSE_UNION:
- if (ctx->re >= ctx->re_end)
+ if (!*ctx->re)
break;
switch (*ctx->re)
{
- case '|':
- DPRINT(("tre_parse: union: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
- STACK_PUSHX(stack, PARSE_UNION);
- STACK_PUSHX(stack, result);
- STACK_PUSHX(stack, PARSE_POST_UNION);
- STACK_PUSHX(stack, PARSE_BRANCH);
+ case CHAR_PIPE:
+ STACK_PUSHX(stack, int, PARSE_UNION);
+ STACK_PUSHX(stack, voidptr, result);
+ STACK_PUSHX(stack, int, PARSE_POST_UNION);
+ STACK_PUSHX(stack, int, PARSE_BRANCH);
ctx->re++;
break;
- case ')':
+ case CHAR_RPAREN:
ctx->re++;
break;
case PARSE_POST_UNION:
{
tre_ast_node_t *tmp_node;
- tre_ast_node_t *tree = tre_stack_pop(stack);
+ tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
tmp_node = tre_ast_new_union(ctx->mem, tree, result);
if (!tmp_node)
return REG_ESPACE;
case PARSE_POSTFIX:
/* Parse postfix operators. */
- if (ctx->re >= ctx->re_end)
+ if (!*ctx->re)
break;
switch (*ctx->re)
{
- case '+':
- case '?':
+ case CHAR_PLUS:
+ case CHAR_QUESTIONMARK:
if (!(ctx->cflags & REG_EXTENDED))
- break;
- case '*':
+ break;
+ /*FALLTHROUGH*/
+ case CHAR_STAR:
{
tre_ast_node_t *tmp_node;
+ int minimal = 0;
int rep_min = 0;
int rep_max = -1;
- if (*ctx->re == '+')
+
+ if (*ctx->re == CHAR_PLUS)
rep_min = 1;
- if (*ctx->re == '?')
+ if (*ctx->re == CHAR_QUESTIONMARK)
rep_max = 1;
+ {
+ if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
+ {
+ minimal = 1;
+ ctx->re++;
+ }
+ else if (*(ctx->re + 1) == CHAR_STAR
+ || *(ctx->re + 1) == CHAR_PLUS)
+ {
+ /* These are reserved for future extensions. */
+ return REG_BADRPT;
+ }
+ }
+
ctx->re++;
- tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max);
+ tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
+ minimal);
if (tmp_node == NULL)
return REG_ESPACE;
result = tmp_node;
- STACK_PUSHX(stack, PARSE_POSTFIX);
- break;
+ STACK_PUSHX(stack, int, PARSE_POSTFIX);
}
+ break;
- case '\\':
+ case CHAR_BACKSLASH:
/* "\{" is special without REG_EXTENDED */
if (!(ctx->cflags & REG_EXTENDED)
- && ctx->re + 1 < ctx->re_end
- && *(ctx->re + 1) == '{')
+ && *(ctx->re + 1) == CHAR_LBRACE)
{
ctx->re++;
goto parse_brace;
else
break;
- case '{':
+ case CHAR_LBRACE:
/* "{" is literal without REG_EXTENDED */
if (!(ctx->cflags & REG_EXTENDED))
break;
parse_brace:
- DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
ctx->re++;
status = tre_parse_bound(ctx, &result);
if (status != REG_OK)
return status;
- STACK_PUSHX(stack, PARSE_POSTFIX);
+ STACK_PUSHX(stack, int, PARSE_POSTFIX);
break;
}
break;
a `\' followed by a character, or a single character. */
/* End of regexp? (empty string). */
- if (ctx->re >= ctx->re_end)
+ if (!*ctx->re)
goto parse_literal;
switch (*ctx->re)
{
- case '(': /* parenthesized subexpression */
+ case CHAR_LPAREN: /* parenthesized subexpression */
if (ctx->cflags & REG_EXTENDED
|| (ctx->re > ctx->re_start
- && *(ctx->re - 1) == '\\'))
+ && *(ctx->re - 1) == CHAR_BACKSLASH))
{
depth++;
{
- DPRINT(("tre_parse: group begin: '%.*" STRF
- "', submatch %d\n",
- ctx->re_end - ctx->re, ctx->re,
- ctx->submatch_id));
ctx->re++;
/* First parse a whole RE, then mark the resulting tree
for submatching. */
- STACK_PUSHX(stack, ctx->submatch_id);
- STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH);
- STACK_PUSHX(stack, PARSE_RE);
+ STACK_PUSHX(stack, int, ctx->submatch_id);
+ STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
+ STACK_PUSHX(stack, int, PARSE_RE);
ctx->submatch_id++;
}
}
goto parse_literal;
break;
- case ')': /* end of current subexpression */
+ case CHAR_RPAREN: /* end of current subexpression */
if ((ctx->cflags & REG_EXTENDED && depth > 0)
|| (ctx->re > ctx->re_start
- && *(ctx->re - 1) == '\\'))
+ && *(ctx->re - 1) == CHAR_BACKSLASH))
{
- DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
/* We were expecting an atom, but instead the current
subexpression was closed. POSIX leaves the meaning of
this to be implementation-defined. We interpret this as
goto parse_literal;
break;
- case '[': /* bracket expression */
- DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
+ case CHAR_LBRACKET: /* bracket expression */
ctx->re++;
status = tre_parse_bracket(ctx, &result);
if (status != REG_OK)
return status;
break;
- case '\\':
+ case CHAR_BACKSLASH:
/* If this is "\(" or "\)" chew off the backslash and
try again. */
if (!(ctx->cflags & REG_EXTENDED)
- && ctx->re + 1 < ctx->re_end
- && (*(ctx->re + 1) == '('
- || *(ctx->re + 1) == ')'))
+ && (*(ctx->re + 1) == CHAR_LPAREN
+ || *(ctx->re + 1) == CHAR_RPAREN))
{
ctx->re++;
- STACK_PUSHX(stack, PARSE_ATOM);
+ STACK_PUSHX(stack, int, PARSE_ATOM);
break;
}
- if (ctx->re + 1 >= ctx->re_end)
+ /* If a macro is used, parse the expanded macro recursively. */
+ {
+ const char *buf = tre_expand_macro(ctx->re + 1);
+ if (buf)
+ {
+ tre_parse_ctx_t subctx;
+ memcpy(&subctx, ctx, sizeof(subctx));
+ subctx.re = buf;
+ subctx.nofirstsub = 1;
+ status = tre_parse(&subctx);
+ if (status != REG_OK)
+ return status;
+ ctx->re += 2;
+ ctx->position = subctx.position;
+ result = subctx.result;
+ break;
+ }
+ }
+
+ if (!*ctx->re)
/* Trailing backslash. */
return REG_EESCAPE;
- DPRINT(("tre_parse: bleep: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
ctx->re++;
switch (*ctx->re)
{
+ case 'b':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_WB, -1);
+ ctx->re++;
+ break;
+ case 'B':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_WB_NEG, -1);
+ ctx->re++;
+ break;
+ case '<':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_BOW, -1);
+ ctx->re++;
+ break;
+ case '>':
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_EOW, -1);
+ ctx->re++;
+ break;
+ case 'x':
+ ctx->re++;
+ if (ctx->re[0] != CHAR_LBRACE)
+ {
+ /* 8 bit hex char. */
+ char tmp[3] = {0, 0, 0};
+ long val;
+
+ if (tre_isxdigit(ctx->re[0]))
+ {
+ tmp[0] = (char)ctx->re[0];
+ ctx->re++;
+ }
+ if (tre_isxdigit(ctx->re[0]))
+ {
+ tmp[1] = (char)ctx->re[0];
+ ctx->re++;
+ }
+ val = strtol(tmp, NULL, 16);
+ result = tre_ast_new_literal(ctx->mem, (int)val,
+ (int)val, ctx->position);
+ ctx->position++;
+ break;
+ }
+ else if (*ctx->re)
+ {
+ /* Wide char. */
+ char tmp[32];
+ long val;
+ int i = 0;
+ ctx->re++;
+ while (*ctx->re && i < sizeof tmp)
+ {
+ if (ctx->re[0] == CHAR_RBRACE)
+ break;
+ if (tre_isxdigit(ctx->re[0]))
+ {
+ tmp[i] = (char)ctx->re[0];
+ i++;
+ ctx->re++;
+ continue;
+ }
+ return REG_EBRACE;
+ }
+ ctx->re++;
+ tmp[i] = 0;
+ val = strtol(tmp, NULL, 16);
+ result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
+ ctx->position);
+ ctx->position++;
+ break;
+ }
+ /*FALLTHROUGH*/
+
default:
- if (!(ctx->cflags & REG_EXTENDED) && tre_isdigit(*ctx->re))
+ if (tre_isdigit(*ctx->re))
{
/* Back reference. */
int val = *ctx->re - '0';
- DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
- ctx->re_end - ctx->re + 1, ctx->re - 1));
result = tre_ast_new_literal(ctx->mem, BACKREF, val,
ctx->position);
if (result == NULL)
else
{
/* Escaped character. */
- DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
- ctx->re_end - ctx->re + 1, ctx->re - 1));
result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
ctx->position);
ctx->position++;
return REG_ESPACE;
break;
- case '.': /* the any-symbol */
- DPRINT(("tre_parse: any: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
+ case CHAR_PERIOD: /* the any-symbol */
if (ctx->cflags & REG_NEWLINE)
{
tre_ast_node_t *tmp1;
ctx->re++;
break;
- case '^': /* beginning of line assertion */
+ case CHAR_CARET: /* beginning of line assertion */
/* '^' has a special meaning everywhere in EREs, and in the
beginning of the RE and after \( is BREs. */
if (ctx->cflags & REG_EXTENDED
|| (ctx->re - 2 >= ctx->re_start
- && *(ctx->re - 2) == '\\'
- && *(ctx->re - 1) == '(')
+ && *(ctx->re - 2) == CHAR_BACKSLASH
+ && *(ctx->re - 1) == CHAR_LPAREN)
|| ctx->re == ctx->re_start)
{
- DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
result = tre_ast_new_literal(ctx->mem, ASSERTION,
ASSERT_AT_BOL, -1);
if (result == NULL)
goto parse_literal;
break;
- case '$': /* end of line assertion. */
+ case CHAR_DOLLAR: /* end of line assertion. */
/* '$' is special everywhere in EREs, and in the end of the
string and before \) is BREs. */
if (ctx->cflags & REG_EXTENDED
- || (ctx->re + 2 < ctx->re_end
- && *(ctx->re + 1) == '\\'
- && *(ctx->re + 2) == ')')
- || ctx->re + 1 == ctx->re_end)
+ || (*(ctx->re + 1) == CHAR_BACKSLASH
+ && *(ctx->re + 2) == CHAR_RPAREN)
+ || !*(ctx->re + 1))
{
- DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
result = tre_ast_new_literal(ctx->mem, ASSERTION,
ASSERT_AT_EOL, -1);
if (result == NULL)
regexp ends here, we interpret it as an empty expression
(which matches an empty string). */
if (
- (ctx->re >= ctx->re_end
- || *ctx->re == '*'
+ (!*ctx->re
+ || *ctx->re == CHAR_STAR
|| (ctx->cflags & REG_EXTENDED
- && (*ctx->re == '|'
- || *ctx->re == '{'
- || *ctx->re == '+'
- || *ctx->re == '?'))
+ && (*ctx->re == CHAR_PIPE
+ || *ctx->re == CHAR_LBRACE
+ || *ctx->re == CHAR_PLUS
+ || *ctx->re == CHAR_QUESTIONMARK))
/* Test for "\)" in BRE mode. */
|| (!(ctx->cflags & REG_EXTENDED)
- && ctx->re + 1 < ctx->re_end
- && *ctx->re == '\\'
- && *(ctx->re + 1) == '{')))
+ && !*(ctx->re + 1)
+ && *ctx->re == CHAR_BACKSLASH
+ && *(ctx->re + 1) == CHAR_LBRACE)))
{
- DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
if (!result)
return REG_ESPACE;
break;
}
- DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
- ctx->re_end - ctx->re, ctx->re));
+ wchar_t wc;
+ int clen = mbtowc(&wc, ctx->re, -1);
+ if (clen<0) clen=1, wc=WEOF;
+
/* Note that we can't use an tre_isalpha() test here, since there
may be characters which are alphabetic but neither upper or
lower case. */
if (ctx->cflags & REG_ICASE
- && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
+ && (tre_isupper(wc) || tre_islower(wc)))
{
tre_ast_node_t *tmp1;
tre_ast_node_t *tmp2;
that at least for multi-character collating elements there
could be several opposite-case counterpoints, but they
cannot be supported portably anyway. */
- tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
- tre_toupper(*ctx->re),
+ tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
+ tre_toupper(wc),
ctx->position);
if (!tmp1)
return REG_ESPACE;
- tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
- tre_tolower(*ctx->re),
+ tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
+ tre_tolower(wc),
ctx->position);
if (!tmp2)
return REG_ESPACE;
}
else
{
- result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
+ result = tre_ast_new_literal(ctx->mem, wc, wc,
ctx->position);
if (!result)
return REG_ESPACE;
}
ctx->position++;
- ctx->re++;
+ ctx->re += clen;
break;
}
break;
case PARSE_MARK_FOR_SUBMATCH:
{
- int submatch_id = (int)tre_stack_pop(stack);
+ int submatch_id = tre_stack_pop_int(stack);
if (result->submatch_id >= 0)
{
}
case PARSE_RESTORE_CFLAGS:
- ctx->cflags = (int)tre_stack_pop(stack);
+ ctx->cflags = tre_stack_pop_int(stack);
+ break;
+
+ default:
+ assert(0);
break;
}
}
}
+
/***********************************************************************
from tre-compile.c
***********************************************************************/
+
+/*
+ TODO:
+ - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
+ function calls.
+*/
+
/*
Algorithms to setup tags so that submatch addressing can be done.
*/
/* Inserts a catenation node to the root of the tree given in `node'.
As the left child a new tag with number `tag_id' to `node' is added,
and the right child is the old root. */
-/* OR */
+static reg_errcode_t
+tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
+{
+ tre_catenation_t *c;
+
+ c = tre_mem_alloc(mem, sizeof(*c));
+ if (c == NULL)
+ return REG_ESPACE;
+ c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
+ if (c->left == NULL)
+ return REG_ESPACE;
+ c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
+ if (c->right == NULL)
+ return REG_ESPACE;
+
+ c->right->obj = node->obj;
+ c->right->type = node->type;
+ c->right->nullable = -1;
+ c->right->submatch_id = -1;
+ c->right->firstpos = NULL;
+ c->right->lastpos = NULL;
+ c->right->num_tags = 0;
+ node->obj = c;
+ node->type = CATENATION;
+ return REG_OK;
+}
+
/* Inserts a catenation node to the root of the tree given in `node'.
As the right child a new tag with number `tag_id' to `node' is added,
and the left child is the old root. */
static reg_errcode_t
-tre_add_tag(tre_mem_t mem, tre_ast_node_t *node, int tag_id, int right)
+tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
{
tre_catenation_t *c;
- tre_ast_node_t *child_tag, *child_old;
-
- DPRINT(("add_tag_%s: tag %d\n", right ? "right" : "left", tag_id));
c = tre_mem_alloc(mem, sizeof(*c));
if (c == NULL)
return REG_ESPACE;
- child_tag = tre_ast_new_literal(mem, TAG, tag_id, -1);
- if (child_tag == NULL)
+ c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
+ if (c->right == NULL)
return REG_ESPACE;
- child_old = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
- if (child_old == NULL)
+ c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
+ if (c->left == NULL)
return REG_ESPACE;
- child_old->obj = node->obj;
- child_old->type = node->type;
- child_old->nullable = -1;
- child_old->submatch_id = -1;
- child_old->firstpos = NULL;
- child_old->lastpos = NULL;
- child_old->num_tags = 0;
+ c->left->obj = node->obj;
+ c->left->type = node->type;
+ c->left->nullable = -1;
+ c->left->submatch_id = -1;
+ c->left->firstpos = NULL;
+ c->left->lastpos = NULL;
+ c->left->num_tags = 0;
node->obj = c;
node->type = CATENATION;
-
- c->right = c->left = child_old;
- if (right) c->right = child_tag;
- else c->left = child_tag;
-
return REG_OK;
}
int next_tag;
} tre_tag_states_t;
+
+/* Go through `regset' and set submatch data for submatches that are
+ using this tag. */
+static void
+tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
+{
+ int i;
+
+ for (i = 0; regset[i] >= 0; i++)
+ {
+ int id = regset[i] / 2;
+ int start = !(regset[i] % 2);
+ if (start)
+ tnfa->submatch_data[id].so_tag = tag;
+ else
+ tnfa->submatch_data[id].eo_tag = tag;
+ }
+ regset[0] = -1;
+}
+
+
/* Adds tags to appropriate locations in the parse tree in `tree', so that
subexpressions marked for submatch addressing can be traced. */
static reg_errcode_t
int first_pass = (mem == NULL || tnfa == NULL);
int *regset, *orig_regset;
int num_tags = 0; /* Total number of tags. */
+ int num_minimals = 0; /* Number of special minimal tags. */
int tag = 0; /* The tag that is to be added next. */
int next_tag = 1; /* Next tag to use after this one. */
int *parents; /* Stack of submatches the current submatch is
contained in. */
+ int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
tre_tag_states_t *saved_states;
tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
if (!first_pass)
+ {
tnfa->end_tag = 0;
+ tnfa->minimal_tags[0] = -1;
+ }
regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
if (regset == NULL)
saved_states[i].tag = -1;
}
- STACK_PUSH(stack, node);
- STACK_PUSH(stack, ADDTAGS_RECURSE);
+ STACK_PUSH(stack, voidptr, node);
+ STACK_PUSH(stack, int, ADDTAGS_RECURSE);
while (tre_stack_num_objects(stack) > bottom)
{
if (status != REG_OK)
break;
- symbol = (tre_addtags_symbol_t)tre_stack_pop(stack);
+ symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
switch (symbol)
{
case ADDTAGS_SET_SUBMATCH_END:
{
- int id = (int)tre_stack_pop(stack);
+ int id = tre_stack_pop_int(stack);
int i;
/* Add end of this submatch to regset. */
}
case ADDTAGS_RECURSE:
- node = tre_stack_pop(stack);
+ node = tre_stack_pop_voidptr(stack);
if (node->submatch_id >= 0)
{
/* Add end of this submatch to regset after processing this
node. */
- STACK_PUSHX(stack, node->submatch_id);
- STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END);
+ STACK_PUSHX(stack, int, node->submatch_id);
+ STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
}
switch (node->type)
if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
{
int i;
- DPRINT(("Literal %d-%d\n",
- (int)lit->code_min, (int)lit->code_max));
if (regset[0] >= 0)
{
/* Regset is not empty, so add a tag before the
literal or backref. */
if (!first_pass)
{
- status = tre_add_tag(mem, node, tag, 0 /*left*/);
+ status = tre_add_tag_left(mem, node, tag);
tnfa->tag_directions[tag] = direction;
- /* Go through the regset and set submatch data for
- submatches that are using this tag. */
- for (i = 0; regset[i] >= 0; i++)
+ if (minimal_tag >= 0)
{
- int id = regset[i] >> 1;
- int start = !(regset[i] & 1);
- DPRINT((" Using tag %d for %s offset of "
- "submatch %d\n", tag,
- start ? "start" : "end", id));
- if (start)
- tnfa->submatch_data[id].so_tag = tag;
- else
- tnfa->submatch_data[id].eo_tag = tag;
+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
+ tnfa->minimal_tags[i] = tag;
+ tnfa->minimal_tags[i + 1] = minimal_tag;
+ tnfa->minimal_tags[i + 2] = -1;
+ minimal_tag = -1;
+ num_minimals++;
}
+ tre_purge_regset(regset, tnfa, tag);
}
else
{
- DPRINT((" num_tags = 1\n"));
node->num_tags = 1;
}
- DPRINT((" num_tags++\n"));
regset[0] = -1;
tag = next_tag;
num_tags++;
tre_ast_node_t *left = cat->left;
tre_ast_node_t *right = cat->right;
int reserved_tag = -1;
- DPRINT(("Catenation, next_tag = %d\n", next_tag));
/* After processing right child. */
- STACK_PUSHX(stack, node);
- STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT);
+ STACK_PUSHX(stack, voidptr, node);
+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
/* Process right child. */
- STACK_PUSHX(stack, right);
- STACK_PUSHX(stack, ADDTAGS_RECURSE);
+ STACK_PUSHX(stack, voidptr, right);
+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* After processing left child. */
- STACK_PUSHX(stack, next_tag + left->num_tags);
- DPRINT((" Pushing %d for after left\n",
- next_tag + left->num_tags));
+ STACK_PUSHX(stack, int, next_tag + left->num_tags);
if (left->num_tags > 0 && right->num_tags > 0)
{
/* Reserve the next tag to the right child. */
- DPRINT((" Reserving next_tag %d to right child\n",
- next_tag));
reserved_tag = next_tag;
next_tag++;
}
- STACK_PUSHX(stack, reserved_tag);
- STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT);
+ STACK_PUSHX(stack, int, reserved_tag);
+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
/* Process left child. */
- STACK_PUSHX(stack, left);
- STACK_PUSHX(stack, ADDTAGS_RECURSE);
+ STACK_PUSHX(stack, voidptr, left);
+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
}
break;
case ITERATION:
{
tre_iteration_t *iter = node->obj;
- DPRINT(("Iteration\n"));
if (first_pass)
{
- STACK_PUSHX(stack, regset[0] >= 0);
+ STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
}
else
{
- STACK_PUSHX(stack, tag);
+ STACK_PUSHX(stack, int, tag);
+ STACK_PUSHX(stack, int, iter->minimal);
}
- STACK_PUSHX(stack, node);
- STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION);
+ STACK_PUSHX(stack, voidptr, node);
+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
- STACK_PUSHX(stack, iter->arg);
- STACK_PUSHX(stack, ADDTAGS_RECURSE);
+ STACK_PUSHX(stack, voidptr, iter->arg);
+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* Regset is not empty, so add a tag here. */
- if (regset[0] >= 0)
+ if (regset[0] >= 0 || iter->minimal)
{
if (!first_pass)
{
int i;
- status = tre_add_tag(mem, node, tag, 0 /*left*/);
- tnfa->tag_directions[tag] = direction;
- /* Go through the regset and set submatch data for
- submatches that are using this tag. */
- for (i = 0; regset[i] >= 0; i++)
+ status = tre_add_tag_left(mem, node, tag);
+ if (iter->minimal)
+ tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
+ else
+ tnfa->tag_directions[tag] = direction;
+ if (minimal_tag >= 0)
{
- int id = regset[i] >> 1;
- int start = !(regset[i] & 1);
- DPRINT((" Using tag %d for %s offset of "
- "submatch %d\n", tag,
- start ? "start" : "end", id));
- if (start)
- tnfa->submatch_data[id].so_tag = tag;
- else
- tnfa->submatch_data[id].eo_tag = tag;
+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
+ tnfa->minimal_tags[i] = tag;
+ tnfa->minimal_tags[i + 1] = minimal_tag;
+ tnfa->minimal_tags[i + 2] = -1;
+ minimal_tag = -1;
+ num_minimals++;
}
+ tre_purge_regset(regset, tnfa, tag);
}
- DPRINT((" num_tags++\n"));
regset[0] = -1;
tag = next_tag;
num_tags++;
right_tag = next_tag;
}
- DPRINT(("Union\n"));
-
/* After processing right child. */
- STACK_PUSHX(stack, right_tag);
- STACK_PUSHX(stack, left_tag);
- STACK_PUSHX(stack, regset);
- STACK_PUSHX(stack, regset[0] >= 0);
- STACK_PUSHX(stack, node);
- STACK_PUSHX(stack, right);
- STACK_PUSHX(stack, left);
- STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT);
+ STACK_PUSHX(stack, int, right_tag);
+ STACK_PUSHX(stack, int, left_tag);
+ STACK_PUSHX(stack, voidptr, regset);
+ STACK_PUSHX(stack, int, regset[0] >= 0);
+ STACK_PUSHX(stack, voidptr, node);
+ STACK_PUSHX(stack, voidptr, right);
+ STACK_PUSHX(stack, voidptr, left);
+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
/* Process right child. */
- STACK_PUSHX(stack, right);
- STACK_PUSHX(stack, ADDTAGS_RECURSE);
+ STACK_PUSHX(stack, voidptr, right);
+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* After processing left child. */
- STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT);
+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
/* Process left child. */
- STACK_PUSHX(stack, left);
- STACK_PUSHX(stack, ADDTAGS_RECURSE);
+ STACK_PUSHX(stack, voidptr, left);
+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* Regset is not empty, so add a tag here. */
if (regset[0] >= 0)
if (!first_pass)
{
int i;
- status = tre_add_tag(mem, node, tag, 0 /*left*/);
+ status = tre_add_tag_left(mem, node, tag);
tnfa->tag_directions[tag] = direction;
- /* Go through the regset and set submatch data for
- submatches that are using this tag. */
- for (i = 0; regset[i] >= 0; i++)
+ if (minimal_tag >= 0)
{
- int id = regset[i] >> 1;
- int start = !(regset[i] & 1);
- DPRINT((" Using tag %d for %s offset of "
- "submatch %d\n", tag,
- start ? "start" : "end", id));
- if (start)
- tnfa->submatch_data[id].so_tag = tag;
- else
- tnfa->submatch_data[id].eo_tag = tag;
+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
+ tnfa->minimal_tags[i] = tag;
+ tnfa->minimal_tags[i + 1] = minimal_tag;
+ tnfa->minimal_tags[i + 2] = -1;
+ minimal_tag = -1;
+ num_minimals++;
}
+ tre_purge_regset(regset, tnfa, tag);
}
- DPRINT((" num_tags++\n"));
regset[0] = -1;
tag = next_tag;
num_tags++;
case ADDTAGS_AFTER_ITERATION:
{
+ int minimal = 0;
int enter_tag;
- node = tre_stack_pop(stack);
+ node = tre_stack_pop_voidptr(stack);
if (first_pass)
+ {
node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
- + (int)tre_stack_pop(stack);
+ + tre_stack_pop_int(stack);
+ minimal_tag = -1;
+ }
else
- enter_tag = (int)tre_stack_pop(stack);
+ {
+ minimal = tre_stack_pop_int(stack);
+ enter_tag = tre_stack_pop_int(stack);
+ if (minimal)
+ minimal_tag = enter_tag;
+ }
- DPRINT(("After iteration\n"));
- direction = TRE_TAG_MAXIMIZE;
+ if (!first_pass)
+ {
+ if (minimal)
+ direction = TRE_TAG_MINIMIZE;
+ else
+ direction = TRE_TAG_MAXIMIZE;
+ }
break;
}
case ADDTAGS_AFTER_CAT_LEFT:
{
- int new_tag = (int)tre_stack_pop(stack);
- next_tag = (int)tre_stack_pop(stack);
- DPRINT(("After cat left, tag = %d, next_tag = %d\n",
- tag, next_tag));
+ int new_tag = tre_stack_pop_int(stack);
+ next_tag = tre_stack_pop_int(stack);
if (new_tag >= 0)
{
- DPRINT((" Setting tag to %d\n", new_tag));
tag = new_tag;
}
break;
}
case ADDTAGS_AFTER_CAT_RIGHT:
- DPRINT(("After cat right\n"));
- node = tre_stack_pop(stack);
+ node = tre_stack_pop_voidptr(stack);
if (first_pass)
node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
+ ((tre_catenation_t *)node->obj)->right->num_tags;
break;
case ADDTAGS_AFTER_UNION_LEFT:
- DPRINT(("After union left\n"));
/* Lift the bottom of the `regset' array so that when processing
the right operand the items currently in the array are
invisible. The original bottom was saved at ADDTAGS_UNION and
case ADDTAGS_AFTER_UNION_RIGHT:
{
int added_tags, tag_left, tag_right;
- tre_ast_node_t *left = tre_stack_pop(stack);
- tre_ast_node_t *right = tre_stack_pop(stack);
- DPRINT(("After union right\n"));
- node = tre_stack_pop(stack);
- added_tags = (int)tre_stack_pop(stack);
+ tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
+ tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
+ node = tre_stack_pop_voidptr(stack);
+ added_tags = tre_stack_pop_int(stack);
if (first_pass)
{
node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
+ ((tre_union_t *)node->obj)->right->num_tags + added_tags
+ ((node->num_submatches > 0) ? 2 : 0);
}
- regset = tre_stack_pop(stack);
- tag_left = (int)tre_stack_pop(stack);
- tag_right = (int)tre_stack_pop(stack);
+ regset = tre_stack_pop_voidptr(stack);
+ tag_left = tre_stack_pop_int(stack);
+ tag_right = tre_stack_pop_int(stack);
/* Add tags after both children, the left child gets a smaller
tag than the right child. This guarantees that we prefer
{
if (!first_pass)
{
- status = tre_add_tag(mem, left, tag_left, 1 /*right*/);
- tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
- status = tre_add_tag(mem, right, tag_right, 1 /*right*/);
- tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
+ status = tre_add_tag_right(mem, left, tag_left);
+ tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
+ status = tre_add_tag_right(mem, right, tag_right);
+ tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
}
- DPRINT((" num_tags += 2\n"));
num_tags += 2;
}
direction = TRE_TAG_MAXIMIZE;
} /* end while(tre_stack_num_objects(stack) > bottom) */
if (!first_pass)
+ tre_purge_regset(regset, tnfa, tag);
+
+ if (!first_pass && minimal_tag >= 0)
{
int i;
- /* Go through the regset and set submatch data for
- submatches that are using this tag. */
- for (i = 0; regset[i] >= 0; i++)
- {
- int id = regset[i] >> 1;
- int start = !(regset[i] & 1);
- DPRINT((" Using tag %d for %s offset of "
- "submatch %d\n", num_tags,
- start ? "start" : "end", id));
- if (start)
- tnfa->submatch_data[id].so_tag = num_tags;
- else
- tnfa->submatch_data[id].eo_tag = num_tags;
- }
+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
+ tnfa->minimal_tags[i] = tag;
+ tnfa->minimal_tags[i + 1] = minimal_tag;
+ tnfa->minimal_tags[i + 2] = -1;
+ minimal_tag = -1;
+ num_minimals++;
}
- DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
- first_pass? "First pass" : "Second pass", num_tags));
-
assert(tree->num_tags == num_tags);
tnfa->end_tag = num_tags;
tnfa->num_tags = num_tags;
+ tnfa->num_minimals = num_minimals;
xfree(orig_regset);
xfree(parents);
xfree(saved_states);
tre_ast_node_t **result = copy;
tre_copyast_symbol_t symbol;
- STACK_PUSH(stack, ast);
- STACK_PUSH(stack, COPY_RECURSE);
+ STACK_PUSH(stack, voidptr, ast);
+ STACK_PUSH(stack, int, COPY_RECURSE);
while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
{
if (status != REG_OK)
break;
- symbol = (tre_copyast_symbol_t)tre_stack_pop(stack);
+ symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
switch (symbol)
{
case COPY_SET_RESULT_PTR:
- result = tre_stack_pop(stack);
+ result = tre_stack_pop_voidptr(stack);
break;
case COPY_RECURSE:
- node = tre_stack_pop(stack);
+ node = tre_stack_pop_voidptr(stack);
switch (node->type)
{
case LITERAL:
case UNION:
{
tre_union_t *uni = node->obj;
- tre_union_t *copy;
+ tre_union_t *tmp;
*result = tre_ast_new_union(mem, uni->left, uni->right);
if (*result == NULL)
{
status = REG_ESPACE;
break;
}
- copy = (*result)->obj;
- result = ©->left;
- STACK_PUSHX(stack, uni->right);
- STACK_PUSHX(stack, COPY_RECURSE);
- STACK_PUSHX(stack, ©->right);
- STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
- STACK_PUSHX(stack, uni->left);
- STACK_PUSHX(stack, COPY_RECURSE);
+ tmp = (*result)->obj;
+ result = &tmp->left;
+ STACK_PUSHX(stack, voidptr, uni->right);
+ STACK_PUSHX(stack, int, COPY_RECURSE);
+ STACK_PUSHX(stack, voidptr, &tmp->right);
+ STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
+ STACK_PUSHX(stack, voidptr, uni->left);
+ STACK_PUSHX(stack, int, COPY_RECURSE);
break;
}
case CATENATION:
{
tre_catenation_t *cat = node->obj;
- tre_catenation_t *copy;
+ tre_catenation_t *tmp;
*result = tre_ast_new_catenation(mem, cat->left, cat->right);
if (*result == NULL)
{
status = REG_ESPACE;
break;
}
- copy = (*result)->obj;
- copy->left = NULL;
- copy->right = NULL;
- result = ©->left;
-
- STACK_PUSHX(stack, cat->right);
- STACK_PUSHX(stack, COPY_RECURSE);
- STACK_PUSHX(stack, ©->right);
- STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
- STACK_PUSHX(stack, cat->left);
- STACK_PUSHX(stack, COPY_RECURSE);
+ tmp = (*result)->obj;
+ tmp->left = NULL;
+ tmp->right = NULL;
+ result = &tmp->left;
+
+ STACK_PUSHX(stack, voidptr, cat->right);
+ STACK_PUSHX(stack, int, COPY_RECURSE);
+ STACK_PUSHX(stack, voidptr, &tmp->right);
+ STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
+ STACK_PUSHX(stack, voidptr, cat->left);
+ STACK_PUSHX(stack, int, COPY_RECURSE);
break;
}
case ITERATION:
{
tre_iteration_t *iter = node->obj;
- STACK_PUSHX(stack, iter->arg);
- STACK_PUSHX(stack, COPY_RECURSE);
- *result = tre_ast_new_iter(mem, iter->arg, iter->min, iter->max);
+ STACK_PUSHX(stack, voidptr, iter->arg);
+ STACK_PUSHX(stack, int, COPY_RECURSE);
+ *result = tre_ast_new_iter(mem, iter->arg, iter->min,
+ iter->max, iter->minimal);
if (*result == NULL)
{
status = REG_ESPACE;
int pos_add = 0;
int pos_add_total = 0;
int max_pos = 0;
- /* Approximate parameter nesting level. */
int iter_depth = 0;
- STACK_PUSHR(stack, ast);
- STACK_PUSHR(stack, EXPAND_RECURSE);
+ STACK_PUSHR(stack, voidptr, ast);
+ STACK_PUSHR(stack, int, EXPAND_RECURSE);
while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
{
tre_ast_node_t *node;
if (status != REG_OK)
break;
- DPRINT(("pos_add %d\n", pos_add));
-
- symbol = (tre_expand_ast_symbol_t)tre_stack_pop(stack);
- node = tre_stack_pop(stack);
+ symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
+ node = tre_stack_pop_voidptr(stack);
switch (symbol)
{
case EXPAND_RECURSE:
case UNION:
{
tre_union_t *uni = node->obj;
- STACK_PUSHX(stack, uni->right);
- STACK_PUSHX(stack, EXPAND_RECURSE);
- STACK_PUSHX(stack, uni->left);
- STACK_PUSHX(stack, EXPAND_RECURSE);
+ STACK_PUSHX(stack, voidptr, uni->right);
+ STACK_PUSHX(stack, int, EXPAND_RECURSE);
+ STACK_PUSHX(stack, voidptr, uni->left);
+ STACK_PUSHX(stack, int, EXPAND_RECURSE);
break;
}
case CATENATION:
{
tre_catenation_t *cat = node->obj;
- STACK_PUSHX(stack, cat->right);
- STACK_PUSHX(stack, EXPAND_RECURSE);
- STACK_PUSHX(stack, cat->left);
- STACK_PUSHX(stack, EXPAND_RECURSE);
+ STACK_PUSHX(stack, voidptr, cat->right);
+ STACK_PUSHX(stack, int, EXPAND_RECURSE);
+ STACK_PUSHX(stack, voidptr, cat->left);
+ STACK_PUSHX(stack, int, EXPAND_RECURSE);
break;
}
case ITERATION:
{
tre_iteration_t *iter = node->obj;
- STACK_PUSHX(stack, pos_add);
- STACK_PUSHX(stack, node);
- STACK_PUSHX(stack, EXPAND_AFTER_ITER);
- STACK_PUSHX(stack, iter->arg);
- STACK_PUSHX(stack, EXPAND_RECURSE);
+ STACK_PUSHX(stack, int, pos_add);
+ STACK_PUSHX(stack, voidptr, node);
+ STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
+ STACK_PUSHX(stack, voidptr, iter->arg);
+ STACK_PUSHX(stack, int, EXPAND_RECURSE);
/* If we are going to expand this node at EXPAND_AFTER_ITER
then don't increase the `pos' fields of the nodes now, it
will get done when expanding. */
if (iter->min > 1 || iter->max > 1)
pos_add = 0;
iter_depth++;
- DPRINT(("iter\n"));
break;
}
default:
{
tre_iteration_t *iter = node->obj;
int pos_add_last;
- pos_add = (int)tre_stack_pop(stack);
+ pos_add = tre_stack_pop_int(stack);
pos_add_last = pos_add;
if (iter->min > 1 || iter->max > 1)
{
tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
- int i;
+ int j;
int pos_add_save = pos_add;
/* Create a catenated sequence of copies of the node. */
- for (i = 0; i < iter->min; i++)
+ for (j = 0; j < iter->min; j++)
{
tre_ast_node_t *copy;
/* Remove tags from all but the last copy. */
- int flags = ((i + 1 < iter->min)
+ int flags = ((j + 1 < iter->min)
? COPY_REMOVE_TAGS
: COPY_MAXIMIZE_FIRST_TAG);
- DPRINT((" pos_add %d\n", pos_add));
pos_add_save = pos_add;
status = tre_copy_ast(mem, stack, iter->arg, flags,
&pos_add, tag_directions, ©,
&pos_add, NULL, &seq2, &max_pos);
if (status != REG_OK)
return status;
- seq2 = tre_ast_new_iter(mem, seq2, 0, -1);
+ seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
if (seq2 == NULL)
return REG_ESPACE;
}
else
{
- for (i = iter->min; i < iter->max; i++)
+ for (j = iter->min; j < iter->max; j++)
{
tre_ast_node_t *tmp, *copy;
pos_add_save = pos_add;
if (max_pos > *position)
*position = max_pos;
-#ifdef TRE_DEBUG
- DPRINT(("Expanded AST:\n"));
- tre_ast_print(ast);
- DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
-#endif
-
return status;
}
set to the number of tags seen on the path. */
static reg_errcode_t
tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
- int *assertions, int *num_tags_seen)
+ int *assertions, int *params, int *num_tags_seen,
+ int *params_seen)
{
tre_literal_t *lit;
tre_union_t *uni;
if (num_tags_seen)
*num_tags_seen = 0;
- status = tre_stack_push(stack, node);
+ status = tre_stack_push_voidptr(stack, node);
/* Walk through the tree recursively. */
while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
{
- node = tre_stack_pop(stack);
+ node = tre_stack_pop_voidptr(stack);
switch (node->type)
{
right subexpression. */
uni = (tre_union_t *)node->obj;
if (uni->left->nullable)
- STACK_PUSHX(stack, uni->left)
+ STACK_PUSHX(stack, voidptr, uni->left)
else if (uni->right->nullable)
- STACK_PUSHX(stack, uni->right)
+ STACK_PUSHX(stack, voidptr, uni->right)
else
assert(0);
break;
cat = (tre_catenation_t *)node->obj;
assert(cat->left->nullable);
assert(cat->right->nullable);
- STACK_PUSHX(stack, cat->left);
- STACK_PUSHX(stack, cat->right);
+ STACK_PUSHX(stack, voidptr, cat->left);
+ STACK_PUSHX(stack, voidptr, cat->right);
break;
case ITERATION:
all, so we go through the argument if possible. */
iter = (tre_iteration_t *)node->obj;
if (iter->arg->nullable)
- STACK_PUSHX(stack, iter->arg);
+ STACK_PUSHX(stack, voidptr, iter->arg);
break;
default:
{
int bottom = tre_stack_num_objects(stack);
- STACK_PUSHR(stack, tree);
- STACK_PUSHR(stack, NFL_RECURSE);
+ STACK_PUSHR(stack, voidptr, tree);
+ STACK_PUSHR(stack, int, NFL_RECURSE);
while (tre_stack_num_objects(stack) > bottom)
{
tre_nfl_stack_symbol_t symbol;
tre_ast_node_t *node;
- symbol = (tre_nfl_stack_symbol_t) tre_stack_pop(stack);
- node = tre_stack_pop(stack);
+ symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
+ node = tre_stack_pop_voidptr(stack);
switch (symbol)
{
case NFL_RECURSE:
return REG_ESPACE;
node->lastpos = tre_set_one(mem, lit->position, 0,
TRE_CHAR_MAX, 0, NULL,
- lit->code_max);
+ (int)lit->code_max);
if (!node->lastpos)
return REG_ESPACE;
}
else if (lit->code_min < 0)
{
- /* Tags, empty strings and zero width assertions:
+ /* Tags, empty strings, params, and zero width assertions:
nullable = true, firstpos = {}, and lastpos = {}. */
node->nullable = 1;
node->firstpos = tre_set_empty(mem);
lastpos = {i}. */
node->nullable = 0;
node->firstpos =
- tre_set_one(mem, lit->position, lit->code_min,
- lit->code_max, 0, NULL, -1);
+ tre_set_one(mem, lit->position, (int)lit->code_min,
+ (int)lit->code_max, 0, NULL, -1);
if (!node->firstpos)
return REG_ESPACE;
node->lastpos = tre_set_one(mem, lit->position,
- lit->code_min, lit->code_max,
- lit->class, lit->neg_classes,
+ (int)lit->code_min,
+ (int)lit->code_max,
+ lit->u.class, lit->neg_classes,
-1);
if (!node->lastpos)
return REG_ESPACE;
case UNION:
/* Compute the attributes for the two subtrees, and after that
for this node. */
- STACK_PUSHR(stack, node);
- STACK_PUSHR(stack, NFL_POST_UNION);
- STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right);
- STACK_PUSHR(stack, NFL_RECURSE);
- STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left);
- STACK_PUSHR(stack, NFL_RECURSE);
+ STACK_PUSHR(stack, voidptr, node);
+ STACK_PUSHR(stack, int, NFL_POST_UNION);
+ STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
+ STACK_PUSHR(stack, int, NFL_RECURSE);
+ STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
+ STACK_PUSHR(stack, int, NFL_RECURSE);
break;
case CATENATION:
/* Compute the attributes for the two subtrees, and after that
for this node. */
- STACK_PUSHR(stack, node);
- STACK_PUSHR(stack, NFL_POST_CATENATION);
- STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right);
- STACK_PUSHR(stack, NFL_RECURSE);
- STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left);
- STACK_PUSHR(stack, NFL_RECURSE);
+ STACK_PUSHR(stack, voidptr, node);
+ STACK_PUSHR(stack, int, NFL_POST_CATENATION);
+ STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
+ STACK_PUSHR(stack, int, NFL_RECURSE);
+ STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
+ STACK_PUSHR(stack, int, NFL_RECURSE);
break;
case ITERATION:
/* Compute the attributes for the subtree, and after that for
this node. */
- STACK_PUSHR(stack, node);
- STACK_PUSHR(stack, NFL_POST_ITERATION);
- STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg);
- STACK_PUSHR(stack, NFL_RECURSE);
+ STACK_PUSHR(stack, voidptr, node);
+ STACK_PUSHR(stack, int, NFL_POST_ITERATION);
+ STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
+ STACK_PUSHR(stack, int, NFL_RECURSE);
break;
}
break; /* end case: NFL_RECURSE */
case NFL_POST_CATENATION:
{
- int num_tags, *tags, assertions;
+ int num_tags, *tags, assertions, params_seen;
+ int *params;
reg_errcode_t status;
tre_catenation_t *cat = node->obj;
node->nullable = cat->left->nullable && cat->right->nullable;
if (cat->left->nullable)
{
/* The left side matches the empty string. Make a first pass
- with tre_match_empty() to get the number of tags. */
+ with tre_match_empty() to get the number of tags and
+ parameters. */
status = tre_match_empty(stack, cat->left,
- NULL, NULL, &num_tags);
+ NULL, NULL, NULL, &num_tags,
+ ¶ms_seen);
if (status != REG_OK)
return status;
/* Allocate arrays for the tags and parameters. */
tags[0] = -1;
assertions = 0;
/* Second pass with tre_mach_empty() to get the list of
- tags. */
+ tags and parameters. */
status = tre_match_empty(stack, cat->left, tags,
- &assertions, NULL);
+ &assertions, params, NULL, NULL);
if (status != REG_OK)
{
xfree(tags);
if (cat->right->nullable)
{
/* The right side matches the empty string. Make a first pass
- with tre_match_empty() to get the number of tags. */
+ with tre_match_empty() to get the number of tags and
+ parameters. */
status = tre_match_empty(stack, cat->right,
- NULL, NULL, &num_tags);
+ NULL, NULL, NULL, &num_tags,
+ ¶ms_seen);
if (status != REG_OK)
return status;
/* Allocate arrays for the tags and parameters. */
tags[0] = -1;
assertions = 0;
/* Second pass with tre_mach_empty() to get the list of
- tags. */
+ tags and parameters. */
status = tre_match_empty(stack, cat->right, tags,
- &assertions, NULL);
+ &assertions, params, NULL, NULL);
if (status != REG_OK)
{
xfree(tags);
detect it here. */
if (trans->state_id == p2->position)
{
- DPRINT(("*"));
break;
}
#endif
trans->tags[l] = -1;
}
-
-#ifdef TRE_DEBUG
- {
- int *tags;
-
- DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
- p1->code_min));
- if (p1->code_max != p1->code_min)
- DPRINT(("-%3d", p1->code_max));
- tags = trans->tags;
- if (tags)
- {
- DPRINT((", tags ["));
- while (*tags >= 0)
- {
- DPRINT(("%d", *tags));
- tags++;
- if (*tags >= 0)
- DPRINT((","));
- }
- DPRINT(("]"));
- }
- if (trans->assertions)
- DPRINT((", assert %d", trans->assertions));
- if (trans->assertions & ASSERT_BACKREF)
- DPRINT((", backref %d", trans->u.backref));
- else if (trans->class)
- DPRINT((", class %ld", (long)trans->class));
- if (trans->neg_classes)
- DPRINT((", neg_classes %p", trans->neg_classes));
- DPRINT(("\n"));
- }
-#endif /* TRE_DEBUG */
p2++;
}
p1++;
}
-static void
-tre_free(regex_t *preg)
-{
- tre_tnfa_t *tnfa;
- unsigned int i;
- tre_tnfa_transition_t *trans;
-
- tnfa = (void *)preg->TRE_REGEX_T_FIELD;
- if (!tnfa)
- return;
-
- for (i = 0; i < tnfa->num_transitions; i++)
- if (tnfa->transitions[i].state)
- {
- if (tnfa->transitions[i].tags)
- xfree(tnfa->transitions[i].tags);
- if (tnfa->transitions[i].neg_classes)
- xfree(tnfa->transitions[i].neg_classes);
- }
- if (tnfa->transitions)
- xfree(tnfa->transitions);
-
- if (tnfa->initial)
- {
- for (trans = tnfa->initial; trans->state; trans++)
- {
- if (trans->tags)
- xfree(trans->tags);
- }
- xfree(tnfa->initial);
- }
-
- if (tnfa->submatch_data)
- {
- for (i = 0; i < tnfa->num_submatches; i++)
- if (tnfa->submatch_data[i].parents)
- xfree(tnfa->submatch_data[i].parents);
- xfree(tnfa->submatch_data);
- }
-
- if (tnfa->tag_directions)
- xfree(tnfa->tag_directions);
- xfree(tnfa);
-}
-
-
#define ERROR_EXIT(err) \
do \
{ \
errcode = err; \
- if (1) goto error_exit; \
+ if (/*CONSTCOND*/1) \
+ goto error_exit; \
} \
- while (0)
+ while (/*CONSTCOND*/0)
-static int
-tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
+int
+regcomp(regex_t *preg, const char *regex, int cflags)
{
tre_stack_t *stack;
tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
parse_ctx.mem = mem;
parse_ctx.stack = stack;
parse_ctx.re = regex;
- parse_ctx.len = n;
parse_ctx.cflags = cflags;
parse_ctx.max_backref = -1;
- DPRINT(("tre_compile: parsing '%.*" STRF "'\n", n, regex));
errcode = tre_parse(&parse_ctx);
if (errcode != REG_OK)
ERROR_EXIT(errcode);
- preg->re_nsub = preg->__nsub2 = parse_ctx.submatch_id - 1;
+ preg->re_nsub = parse_ctx.submatch_id - 1;
tree = parse_ctx.result;
+ /* Back references and approximate matching cannot currently be used
+ in the same regexp. */
+ if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
+ ERROR_EXIT(REG_BADPAT);
+
#ifdef TRE_DEBUG
tre_ast_print(tree);
#endif /* TRE_DEBUG */
if (tnfa == NULL)
ERROR_EXIT(REG_ESPACE);
tnfa->have_backrefs = parse_ctx.max_backref >= 0;
+ tnfa->have_approx = parse_ctx.have_approx;
tnfa->num_submatches = parse_ctx.submatch_id;
/* Set up tags for submatch addressing. If REG_NOSUB is set and the
regexp does not have back references, this can be skipped. */
if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
{
- DPRINT(("tre_compile: setting up tags\n"));
/* Figure out how many tags we will need. */
errcode = tre_add_tags(NULL, stack, tree, tnfa);
if (errcode != REG_OK)
ERROR_EXIT(errcode);
-#ifdef TRE_DEBUG
- tre_ast_print(tree);
-#endif /* TRE_DEBUG */
if (tnfa->num_tags > 0)
{
memset(tag_directions, -1,
sizeof(*tag_directions) * (tnfa->num_tags + 1));
}
+ tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
+ sizeof(tnfa->minimal_tags));
+ if (tnfa->minimal_tags == NULL)
+ ERROR_EXIT(REG_ESPACE);
- submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
+ submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
+ sizeof(*submatch_data));
if (submatch_data == NULL)
ERROR_EXIT(REG_ESPACE);
tnfa->submatch_data = submatch_data;
if (errcode != REG_OK)
ERROR_EXIT(errcode);
-#ifdef TRE_DEBUG
- for (i = 0; i < parse_ctx.submatch_id; i++)
- DPRINT(("pmatch[%d] = {t%d, t%d}\n",
- i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
- for (i = 0; i < tnfa->num_tags; i++)
- DPRINT(("t%d is %s\n", i,
- tag_directions[i] == TRE_TAG_MINIMIZE ?
- "minimized" : "maximized"));
-#endif /* TRE_DEBUG */
}
/* Expand iteration nodes. */
errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
- tag_directions, NULL);
+ tag_directions, &tnfa->params_depth);
if (errcode != REG_OK)
ERROR_EXIT(errcode);
if (tree == NULL)
ERROR_EXIT(REG_ESPACE);
-#ifdef TRE_DEBUG
- tre_ast_print(tree);
- DPRINT(("Number of states: %d\n", parse_ctx.position));
-#endif /* TRE_DEBUG */
-
errcode = tre_compute_nfl(mem, stack, tree);
if (errcode != REG_OK)
ERROR_EXIT(errcode);
add += counts[i] + 1;
counts[i] = 0;
}
- transitions = xcalloc(add + 1, sizeof(*transitions));
+ transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
if (transitions == NULL)
ERROR_EXIT(REG_ESPACE);
tnfa->transitions = transitions;
tnfa->num_transitions = add;
- DPRINT(("Converting to TNFA:\n"));
errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
if (errcode != REG_OK)
ERROR_EXIT(errcode);
+ tnfa->firstpos_chars = NULL;
+
p = tree->firstpos;
i = 0;
while (p->position >= 0)
{
i++;
-
-#ifdef TRE_DEBUG
- {
- int *tags;
- DPRINT(("initial: %d", p->position));
- tags = p->tags;
- if (tags != NULL)
- {
- if (*tags >= 0)
- DPRINT(("/"));
- while (*tags >= 0)
- {
- DPRINT(("%d", *tags));
- tags++;
- if (*tags >= 0)
- DPRINT((","));
- }
- }
- DPRINT((", assert %d", p->assertions));
- DPRINT(("\n"));
- }
-#endif /* TRE_DEBUG */
-
p++;
}
- initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
+ initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
if (initial == NULL)
ERROR_EXIT(REG_ESPACE);
tnfa->initial = initial;
initial[i].state = transitions + offs[p->position];
initial[i].state_id = p->position;
initial[i].tags = NULL;
- /* Copy the arrays p->tags, they are allocated
+ /* Copy the arrays p->tags, and p->params, they are allocated
from a tre_mem object. */
if (p->tags)
{
tnfa->num_states = parse_ctx.position;
tnfa->cflags = cflags;
- DPRINT(("final state %p\n", (void *)tnfa->final));
-
tre_mem_destroy(mem);
tre_stack_destroy(stack);
xfree(counts);
if (offs != NULL)
xfree(offs);
preg->TRE_REGEX_T_FIELD = (void *)tnfa;
- tre_free(preg);
+ regfree(preg);
return errcode;
}
-/***********************************************************************
- from regcomp.c
-***********************************************************************/
-int
-regcomp(regex_t *preg, const char *regex, int cflags)
+
+void
+regfree(regex_t *preg)
{
- int ret;
- tre_char_t *wregex;
- size_t n = strlen(regex);
+ tre_tnfa_t *tnfa;
+ unsigned int i;
+ tre_tnfa_transition_t *trans;
- if (n+1 > SIZE_MAX/sizeof(tre_char_t))
- return REG_ESPACE;
- wregex = xmalloc(sizeof(tre_char_t) * (n + 1));
- if (wregex == NULL)
- return REG_ESPACE;
+ tnfa = (void *)preg->TRE_REGEX_T_FIELD;
+ if (!tnfa)
+ return;
- n = mbstowcs(wregex, regex, n+1);
- if (n == (size_t)-1) {
- xfree(wregex);
- return REG_BADPAT;
- }
+ for (i = 0; i < tnfa->num_transitions; i++)
+ if (tnfa->transitions[i].state)
+ {
+ if (tnfa->transitions[i].tags)
+ xfree(tnfa->transitions[i].tags);
+ if (tnfa->transitions[i].neg_classes)
+ xfree(tnfa->transitions[i].neg_classes);
+ }
+ if (tnfa->transitions)
+ xfree(tnfa->transitions);
- ret = tre_compile(preg, wregex, n, cflags);
- xfree(wregex);
+ if (tnfa->initial)
+ {
+ for (trans = tnfa->initial; trans->state; trans++)
+ {
+ if (trans->tags)
+ xfree(trans->tags);
+ }
+ xfree(tnfa->initial);
+ }
- return ret;
-}
+ if (tnfa->submatch_data)
+ {
+ for (i = 0; i < tnfa->num_submatches; i++)
+ if (tnfa->submatch_data[i].parents)
+ xfree(tnfa->submatch_data[i].parents);
+ xfree(tnfa->submatch_data);
+ }
-void
-regfree(regex_t *preg)
-{
- tre_free(preg);
+ if (tnfa->tag_directions)
+ xfree(tnfa->tag_directions);
+ if (tnfa->firstpos_chars)
+ xfree(tnfa->firstpos_chars);
+ if (tnfa->minimal_tags)
+ xfree(tnfa->minimal_tags);
+ xfree(tnfa);
}
-
-/* EOF */
/*
regexec.c - TRE POSIX compatible matching functions (and more).
- Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>.
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
str_byte += pos_add_next; \
} while (0)
+#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c))
+
#define CHECK_ASSERTIONS(assertions) \
(((assertions & ASSERT_AT_BOL) \
&& (pos > 0 || reg_notbol) \
&& (prev_c != L'\n' || !reg_newline)) \
|| ((assertions & ASSERT_AT_EOL) \
&& (next_c != L'\0' || reg_noteol) \
- && (next_c != L'\n' || !reg_newline)))
+ && (next_c != L'\n' || !reg_newline)) \
+ || ((assertions & ASSERT_AT_BOW) \
+ && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
+ || ((assertions & ASSERT_AT_EOW) \
+ && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
+ || ((assertions & ASSERT_AT_WB) \
+ && (pos != 0 && next_c != L'\0' \
+ && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
+ || ((assertions & ASSERT_AT_WB_NEG) \
+ && (pos == 0 || next_c == L'\0' \
+ || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
+
+#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
+ (((trans_i->assertions & ASSERT_CHAR_CLASS) \
+ && !(tnfa->cflags & REG_ICASE) \
+ && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \
+ || ((trans_i->assertions & ASSERT_CHAR_CLASS) \
+ && (tnfa->cflags & REG_ICASE) \
+ && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \
+ && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \
+ || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \
+ && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
+ tnfa->cflags & REG_ICASE)))
+
+
+
/* Returns 1 if `t1' wins `t2', 0 otherwise. */
static int
static int
tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
{
- DPRINT(("neg_char_classes_test: %p, %d, %d\n", classes, wc, icase));
while (*classes != (tre_ctype_t)0)
if ((!icase && tre_isctype(wc, *classes))
|| (icase && (tre_isctype(tre_toupper(wc), *classes)
See `tre-match-backtrack.c'.
*/
-
typedef struct {
tre_tnfa_transition_t *state;
int *tags;
} tre_reach_pos_t;
-#ifdef TRE_DEBUG
-static void
-tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
-{
- int i;
-
- while (reach->state != NULL)
- {
- DPRINT((" %p", (void *)reach->state));
- if (num_tags > 0)
- {
- DPRINT(("/"));
- for (i = 0; i < num_tags; i++)
- {
- DPRINT(("%d:%d", i, reach->tags[i]));
- if (i < (num_tags-1))
- DPRINT((","));
- }
- }
- reach++;
- }
- DPRINT(("\n"));
-
-}
-#endif /* TRE_DEBUG */
-
static reg_errcode_t
-tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
- int *match_tags, int eflags, int *match_end_ofs)
+tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
+ int *match_tags, int eflags,
+ int *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
const char *str_byte = string;
int pos = -1;
- int pos_add_next = 1;
+ unsigned int pos_add_next = 1;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
+ (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
/* Allocate the memory. */
-#ifdef TRE_USE_ALLOCA
- buf = alloca(total_bytes);
-#else /* !TRE_USE_ALLOCA */
- buf = xmalloc(total_bytes);
-#endif /* !TRE_USE_ALLOCA */
+ buf = xmalloc((unsigned)total_bytes);
if (buf == NULL)
return REG_ESPACE;
- memset(buf, 0, total_bytes);
+ memset(buf, 0, (size_t)total_bytes);
/* Get the various pointers within tmp_buf (properly aligned). */
tmp_tags = (void *)buf;
GET_NEXT_WCHAR();
pos = 0;
- DPRINT(("length: %d\n", len));
- DPRINT(("pos:chr/code | states and tags\n"));
- DPRINT(("-------------+------------------------------------------------\n"));
-
reach_next_i = reach_next;
while (1)
{
/* If no match found yet, add the initial states to `reach_next'. */
if (match_eo < 0)
{
- DPRINT((" init >"));
trans_i = tnfa->initial;
while (trans_i->state != NULL)
{
if (trans_i->assertions
&& CHECK_ASSERTIONS(trans_i->assertions))
{
- DPRINT(("assertion failed\n"));
trans_i++;
continue;
}
- DPRINT((" %p", (void *)trans_i->state));
reach_next_i->state = trans_i->state;
for (i = 0; i < num_tags; i++)
reach_next_i->tags[i] = -1;
}
if (reach_next_i->state == tnfa->final)
{
- DPRINT((" found empty match\n"));
match_eo = pos;
new_match = 1;
for (i = 0; i < num_tags; i++)
}
trans_i++;
}
- DPRINT(("\n"));
reach_next_i->state = NULL;
}
else
GET_NEXT_WCHAR();
-#ifdef TRE_DEBUG
- DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
- tre_print_reach(tnfa, reach_next, num_tags);
- DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
- tre_print_reach(tnfa, reach_next, num_tags);
-#endif /* TRE_DEBUG */
-
/* Swap `reach' and `reach_next'. */
reach_i = reach;
reach = reach_next;
reach_next = reach_i;
+ /* For each state in `reach', weed out states that don't fulfill the
+ minimal matching conditions. */
+ if (tnfa->num_minimals && new_match)
+ {
+ new_match = 0;
+ reach_next_i = reach_next;
+ for (reach_i = reach; reach_i->state; reach_i++)
+ {
+ int skip = 0;
+ for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
+ {
+ int end = tnfa->minimal_tags[i];
+ int start = tnfa->minimal_tags[i + 1];
+ if (end >= num_tags)
+ {
+ skip = 1;
+ break;
+ }
+ else if (reach_i->tags[start] == match_tags[start]
+ && reach_i->tags[end] < match_tags[end])
+ {
+ skip = 1;
+ break;
+ }
+ }
+ if (!skip)
+ {
+ reach_next_i->state = reach_i->state;
+ tmp_iptr = reach_next_i->tags;
+ reach_next_i->tags = reach_i->tags;
+ reach_i->tags = tmp_iptr;
+ reach_next_i++;
+ }
+ }
+ reach_next_i->state = NULL;
+
+ /* Swap `reach' and `reach_next'. */
+ reach_i = reach;
+ reach = reach_next;
+ reach_next = reach_i;
+ }
+
/* For each state in `reach' see if there is a transition leaving with
the current input symbol to a state not yet in `reach_next', and
add the destination states to `reach_next'. */
for (trans_i = reach_i->state; trans_i->state; trans_i++)
{
/* Does this transition match the input symbol? */
- if (trans_i->code_min <= prev_c &&
- trans_i->code_max >= prev_c)
+ if (trans_i->code_min <= (tre_cint_t)prev_c &&
+ trans_i->code_max >= (tre_cint_t)prev_c)
{
if (trans_i->assertions
&& (CHECK_ASSERTIONS(trans_i->assertions)
- /* Handle character class transitions. */
- || ((trans_i->assertions & ASSERT_CHAR_CLASS)
- && !(tnfa->cflags & REG_ICASE)
- && !tre_isctype((tre_cint_t)prev_c,
- trans_i->u.class))
- || ((trans_i->assertions & ASSERT_CHAR_CLASS)
- && (tnfa->cflags & REG_ICASE)
- && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
- trans_i->u.class)
- && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
- trans_i->u.class)))
- || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
- && tre_neg_char_classes_match(trans_i->neg_classes,
- (tre_cint_t)prev_c,
- tnfa->cflags & REG_ICASE))))
+ || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
{
- DPRINT(("assertion failed\n"));
continue;
}
|| (num_tags > 0
&& reach_next_i->tags[0] <= match_tags[0])))
{
- DPRINT((" found match %p\n", trans_i->state));
match_eo = pos;
new_match = 1;
for (i = 0; i < num_tags; i++)
*reach_pos[trans_i->state_id].tags = tmp_tags;
if (trans_i->state == tnfa->final)
{
- DPRINT((" found better match\n"));
match_eo = pos;
new_match = 1;
for (i = 0; i < num_tags; i++)
reach_next_i->state = NULL;
}
- DPRINT(("match end offset = %d\n", match_eo));
-
-#ifndef TRE_USE_ALLOCA
if (buf)
xfree(buf);
-#endif /* !TRE_USE_ALLOCA */
*match_end_ofs = match_eo;
return match_eo >= 0 ? REG_OK : REG_NOMATCH;
}
+
/***********************************************************************
from tre-match-backtrack.c
***********************************************************************/
#define BT_STACK_MBSTATE_OUT
#endif /* !TRE_MBSTATE */
-
-#ifdef TRE_USE_ALLOCA
-#define tre_bt_mem_new tre_mem_newa
-#define tre_bt_mem_alloc tre_mem_alloca
-#define tre_bt_mem_destroy(obj) do { } while (0)
-#else /* !TRE_USE_ALLOCA */
#define tre_bt_mem_new tre_mem_new
#define tre_bt_mem_alloc tre_mem_alloc
#define tre_bt_mem_destroy tre_mem_destroy
-#endif /* !TRE_USE_ALLOCA */
#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- int len, int *match_tags,
- int eflags, int *match_end_ofs)
+ int *match_tags, int eflags, int *match_end_ofs)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
const char *str_byte = string;
int pos = 0;
- int pos_add_next = 1;
+ unsigned int pos_add_next = 1;
#ifdef TRE_MBSTATE
mbstate_t mbstate;
#endif /* TRE_MBSTATE */
mbstate_t mbstate_start;
#endif /* TRE_MBSTATE */
- /* Compilation flags for this regexp. */
- int cflags = tnfa->cflags;
-
/* End offset of best match so far, or -1 if no match found yet. */
int match_eo = -1;
/* Tag arrays. */
stack->prev = NULL;
stack->next = NULL;
-#ifdef TRE_USE_ALLOCA
- tags = alloca(sizeof(*tags) * tnfa->num_tags);
- pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
- states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
-#else /* !TRE_USE_ALLOCA */
- tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
- if (!tags)
+ if (tnfa->num_tags)
{
- ret = REG_ESPACE;
- goto error_exit;
+ tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
+ if (!tags)
+ {
+ ret = REG_ESPACE;
+ goto error_exit;
+ }
}
- pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
- if (!pmatch)
+ if (tnfa->num_submatches)
{
- ret = REG_ESPACE;
- goto error_exit;
+ pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
+ if (!pmatch)
+ {
+ ret = REG_ESPACE;
+ goto error_exit;
+ }
}
- states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
- if (!states_seen)
+ if (tnfa->num_states)
{
- ret = REG_ESPACE;
- goto error_exit;
+ states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
+ if (!states_seen)
+ {
+ ret = REG_ESPACE;
+ goto error_exit;
+ }
}
-#endif /* !TRE_USE_ALLOCA */
retry:
{
next_tags = NULL;
for (trans_i = tnfa->initial; trans_i->state; trans_i++)
{
- DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
{
- DPRINT(("assert failed\n"));
continue;
}
if (state == NULL)
else
{
/* Backtrack to this state. */
- DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
- BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
+ BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
trans_i->state_id, next_c, tags, mbstate);
{
int *tmp = trans_i->tags;
tags[*next_tags] = pos;
- DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
- DPRINT(("pos:chr/code | state and tags\n"));
- DPRINT(("-------------+------------------------------------------------\n"));
-
if (state == NULL)
goto backtrack;
while (1)
{
- tre_tnfa_transition_t *trans_i, *next_state;
+ tre_tnfa_transition_t *next_state;
int empty_br_match;
- DPRINT(("start loop\n"));
if (state == tnfa->final)
{
- DPRINT((" match found, %d %d\n", match_eo, pos));
if (match_eo < pos
|| (match_eo == pos
&& match_tags
{
int i;
/* This match wins the previous match. */
- DPRINT((" win previous\n"));
match_eo = pos;
if (match_tags)
for (i = 0; i < tnfa->num_tags; i++)
goto backtrack;
}
-#ifdef TRE_DEBUG
- DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
- state));
- {
- int i;
- for (i = 0; i < tnfa->num_tags; i++)
- DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
- DPRINT(("\n"));
- }
-#endif /* TRE_DEBUG */
-
/* Go to the next character in the input string. */
empty_br_match = 0;
trans_i = state;
int bt_len;
int result;
- DPRINT((" should match back reference %d\n", bt));
/* Get the substring we need to match against. Remember to
turn off REG_NOSUB temporarily. */
- tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & !REG_NOSUB,
+ tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
tnfa, tags, pos);
so = pmatch[bt].rm_so;
eo = pmatch[bt].rm_eo;
bt_len = eo - so;
- if (len < 0)
- {
- result = strncmp((char*)string + so, str_byte - 1, bt_len);
- }
- else if (len - pos < bt_len)
- result = 1;
- else
- result = memcmp((char*)string + so, str_byte - 1, bt_len);
+ result = strncmp((const char*)string + so, str_byte - 1,
+ (size_t)bt_len);
- /* We can ignore multibyte characters here because the backref
- string is already aligned at character boundaries. */
if (result == 0)
{
/* Back reference matched. Check for infinite loop. */
empty_br_match = 1;
if (empty_br_match && states_seen[trans_i->state_id])
{
- DPRINT((" avoid loop\n"));
goto backtrack;
}
/* Advance in input string and resync `prev_c', `next_c'
and pos. */
- DPRINT((" back reference matched\n"));
str_byte += bt_len - 1;
pos += bt_len - 1;
GET_NEXT_WCHAR();
- DPRINT((" pos now %d\n", pos));
}
else
{
- DPRINT((" back reference did not match\n"));
goto backtrack;
}
}
else
{
/* Check for end of string. */
- if (len < 0)
- {
- if (next_c == L'\0')
- goto backtrack;
- }
- else
- {
- if (pos >= len)
+ if (next_c == L'\0')
goto backtrack;
- }
/* Read the next character. */
GET_NEXT_WCHAR();
next_state = NULL;
for (trans_i = state; trans_i->state; trans_i++)
{
- DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
- trans_i->code_min, trans_i->code_max,
- trans_i->code_min, trans_i->code_max,
- trans_i->assertions, trans_i->state_id));
- if (trans_i->code_min <= prev_c && trans_i->code_max >= prev_c)
+ if (trans_i->code_min <= (tre_cint_t)prev_c
+ && trans_i->code_max >= (tre_cint_t)prev_c)
{
if (trans_i->assertions
&& (CHECK_ASSERTIONS(trans_i->assertions)
- /* Handle character class transitions. */
- || ((trans_i->assertions & ASSERT_CHAR_CLASS)
- && !(cflags & REG_ICASE)
- && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))
- || ((trans_i->assertions & ASSERT_CHAR_CLASS)
- && (cflags & REG_ICASE)
- && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
- trans_i->u.class)
- && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
- trans_i->u.class)))
- || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
- && tre_neg_char_classes_match(trans_i->neg_classes,
- (tre_cint_t)prev_c,
- cflags & REG_ICASE))))
+ || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
{
- DPRINT((" assertion failed\n"));
continue;
}
if (next_state == NULL)
{
/* First matching transition. */
- DPRINT((" Next state is %d\n", trans_i->state_id));
next_state = trans_i->state;
next_tags = trans_i->tags;
}
else
{
- /* Second mathing transition. We may need to backtrack here
+ /* Second matching transition. We may need to backtrack here
to take this transition instead of the first one, so we
push this transition in the backtracking stack so we can
jump back here if needed. */
- DPRINT((" saving state %d for backtracking\n",
- trans_i->state_id));
- BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
+ BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
trans_i->state_id, next_c, tags, mbstate);
{
int *tmp;
/* A matching transition was not found. Try to backtrack. */
if (stack->prev)
{
- DPRINT((" backtracking\n"));
if (stack->item.state->assertions & ASSERT_BACKREF)
{
- DPRINT((" states_seen[%d] = 0\n",
- stack->item.state_id));
states_seen[stack->item.state_id] = 0;
}
{
/* Try starting from a later position in the input string. */
/* Check for end of string. */
- if (len < 0)
- {
- if (next_c == L'\0')
- {
- DPRINT(("end of string.\n"));
- break;
- }
- }
- else
- {
- if (pos >= len)
+ if (next_c == L'\0')
{
- DPRINT(("end of string.\n"));
break;
}
- }
- DPRINT(("restarting from next start position\n"));
next_c = next_c_start;
#ifdef TRE_MBSTATE
mbstate = mbstate_start;
}
else
{
- DPRINT(("finished\n"));
break;
}
}
return ret;
}
-
/***********************************************************************
from regexec.c
***********************************************************************/
if (match_eo >= 0 && !(cflags & REG_NOSUB))
{
/* Construct submatch offsets from the tags. */
- DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
submatch_data = tnfa->submatch_data;
while (i < tnfa->num_submatches && i < nmatch)
{
if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
pmatch[i].rm_so = pmatch[i].rm_eo = -1;
- DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
- submatch_data[i].so_tag, pmatch[i].rm_so,
- submatch_data[i].eo_tag, pmatch[i].rm_eo));
i++;
}
/* Reset all submatches that are not within all of their parent
if (parents != NULL)
for (j = 0; parents[j] >= 0; j++)
{
- DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
|| pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
pmatch[i].rm_so = pmatch[i].rm_eo = -1;
Wrapper functions for POSIX compatible regexp matching.
*/
-static int
-tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
+int
+regexec(const regex_t *preg, const char *string,
size_t nmatch, regmatch_t pmatch[], int eflags)
{
+ tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
reg_errcode_t status;
int *tags = NULL, eo;
if (tnfa->num_tags > 0 && nmatch > 0)
{
-#ifdef TRE_USE_ALLOCA
- tags = alloca(sizeof(*tags) * tnfa->num_tags);
-#else /* !TRE_USE_ALLOCA */
tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
-#endif /* !TRE_USE_ALLOCA */
if (tags == NULL)
return REG_ESPACE;
}
if (tnfa->have_backrefs)
{
/* The regex has back references, use the backtracking matcher. */
- status = tre_tnfa_run_backtrack(tnfa, string, len, tags, eflags, &eo);
+ status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
}
else
{
/* Exact matching, no back references, use the parallel matcher. */
- status = tre_tnfa_run_parallel(tnfa, string, len, tags, eflags, &eo);
+ status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
}
if (status == REG_OK)
/* A match was found, so fill the submatch registers. */
tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
-#ifndef TRE_USE_ALLOCA
if (tags)
xfree(tags);
-#endif /* !TRE_USE_ALLOCA */
return status;
}
-
-int
-regexec(const regex_t *preg, const char *str,
- size_t nmatch, regmatch_t pmatch[], int eflags)
-{
- return tre_match((void *)preg->TRE_REGEX_T_FIELD, str, -1,
- nmatch, pmatch, eflags);
-}
-
-/* EOF */