2 regcomp.c - TRE POSIX compatible regex compilation functions.
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 /***********************************************************************
45 ***********************************************************************/
54 tre_ctype_t *neg_classes;
60 /***********************************************************************
61 from tre-ast.c and tre-ast.h
62 ***********************************************************************/
64 /* The different AST node types. */
72 /* Special subtypes of TRE_LITERAL. */
73 #define EMPTY -1 /* Empty leaf (denotes empty string). */
74 #define ASSERTION -2 /* Assertion leaf. */
75 #define TAG -3 /* Tag leaf. */
76 #define BACKREF -4 /* Back reference leaf. */
78 #define IS_SPECIAL(x) ((x)->code_min < 0)
79 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
80 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
81 #define IS_TAG(x) ((x)->code_min == TAG)
82 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
85 /* A generic AST node. All AST nodes consist of this node on the top
86 level with `obj' pointing to the actual content. */
88 tre_ast_type_t type; /* Type of the node. */
89 void *obj; /* Pointer to actual node. */
94 tre_pos_and_tags_t *firstpos;
95 tre_pos_and_tags_t *lastpos;
99 /* A "literal" node. These are created for assertions, back references,
100 tags, matching parameter settings, and all expressions that match one
110 tre_ctype_t *neg_classes;
113 /* A "catenation" node. These are created when two regexps are concatenated.
114 If there are more than one subexpressions in sequence, the `left' part
115 holds all but the last, and `right' part holds the last subexpression
116 (catenation is left associative). */
118 tre_ast_node_t *left;
119 tre_ast_node_t *right;
122 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
125 /* Subexpression to match. */
127 /* Minimum number of consecutive matches. */
129 /* Maximum number of consecutive matches. */
131 /* If 0, match as many characters as possible, if 1 match as few as
132 possible. Note that this does not always mean the same thing as
133 matching as many/few repetitions as possible. */
134 unsigned int minimal:1;
137 /* An "union" node. These are created for the "|" operator. */
139 tre_ast_node_t *left;
140 tre_ast_node_t *right;
143 static tre_ast_node_t *
144 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
146 static tre_ast_node_t *
147 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
149 static tre_ast_node_t *
150 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
153 static tre_ast_node_t *
154 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
156 static tre_ast_node_t *
157 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
158 tre_ast_node_t *right);
161 static tre_ast_node_t *
162 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
164 tre_ast_node_t *node;
166 node = tre_mem_calloc(mem, sizeof(*node));
169 node->obj = tre_mem_calloc(mem, size);
174 node->submatch_id = -1;
179 static tre_ast_node_t *
180 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
182 tre_ast_node_t *node;
185 node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
189 lit->code_min = code_min;
190 lit->code_max = code_max;
191 lit->position = position;
196 static tre_ast_node_t *
197 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
200 tre_ast_node_t *node;
201 tre_iteration_t *iter;
203 node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
210 iter->minimal = minimal;
211 node->num_submatches = arg->num_submatches;
216 static tre_ast_node_t *
217 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
219 tre_ast_node_t *node;
221 node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
224 ((tre_union_t *)node->obj)->left = left;
225 ((tre_union_t *)node->obj)->right = right;
226 node->num_submatches = left->num_submatches + right->num_submatches;
231 static tre_ast_node_t *
232 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
233 tre_ast_node_t *right)
235 tre_ast_node_t *node;
237 node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
240 ((tre_catenation_t *)node->obj)->left = left;
241 ((tre_catenation_t *)node->obj)->right = right;
242 node->num_submatches = left->num_submatches + right->num_submatches;
248 /***********************************************************************
249 from tre-stack.c and tre-stack.h
250 ***********************************************************************/
252 typedef struct tre_stack_rec tre_stack_t;
254 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
255 is maximum size, and `increment' specifies how much more space will be
256 allocated with realloc() if all space gets used up. Returns the stack
257 object or NULL if out of memory. */
259 tre_stack_new(int size, int max_size, int increment);
261 /* Frees the stack object. */
263 tre_stack_destroy(tre_stack_t *s);
265 /* Returns the current number of objects in the stack. */
267 tre_stack_num_objects(tre_stack_t *s);
269 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
270 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
271 This tries to realloc() more space before failing if maximum size
272 has not yet been reached. Returns REG_OK if successful. */
273 #define declare_pushf(typetag, type) \
274 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
276 declare_pushf(voidptr, void *);
277 declare_pushf(int, int);
279 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
280 element off of stack `s' and returns it. The stack must not be
282 #define declare_popf(typetag, type) \
283 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
285 declare_popf(voidptr, void *);
286 declare_popf(int, int);
288 /* Just to save some typing. */
289 #define STACK_PUSH(s, typetag, value) \
292 status = tre_stack_push_ ## typetag(s, value); \
294 while (/*CONSTCOND*/0)
296 #define STACK_PUSHX(s, typetag, value) \
298 status = tre_stack_push_ ## typetag(s, value); \
299 if (status != REG_OK) \
303 #define STACK_PUSHR(s, typetag, value) \
305 reg_errcode_t _status; \
306 _status = tre_stack_push_ ## typetag(s, value); \
307 if (_status != REG_OK) \
311 union tre_stack_item {
316 struct tre_stack_rec {
321 union tre_stack_item *stack;
326 tre_stack_new(int size, int max_size, int increment)
330 s = xmalloc(sizeof(*s));
333 s->stack = xmalloc(sizeof(*s->stack) * size);
334 if (s->stack == NULL)
340 s->max_size = max_size;
341 s->increment = increment;
348 tre_stack_destroy(tre_stack_t *s)
355 tre_stack_num_objects(tre_stack_t *s)
361 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
363 if (s->ptr < s->size)
365 s->stack[s->ptr] = value;
370 if (s->size >= s->max_size)
376 union tre_stack_item *new_buffer;
378 new_size = s->size + s->increment;
379 if (new_size > s->max_size)
380 new_size = s->max_size;
381 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
382 if (new_buffer == NULL)
386 assert(new_size > s->size);
388 s->stack = new_buffer;
389 tre_stack_push(s, value);
395 #define define_pushf(typetag, type) \
396 declare_pushf(typetag, type) { \
397 union tre_stack_item item; \
398 item.typetag ## _value = value; \
399 return tre_stack_push(s, item); \
402 define_pushf(int, int)
403 define_pushf(voidptr, void *)
405 #define define_popf(typetag, type) \
406 declare_popf(typetag, type) { \
407 return s->stack[--s->ptr].typetag ## _value; \
410 define_popf(int, int)
411 define_popf(voidptr, void *)
414 /***********************************************************************
415 from tre-parse.c and tre-parse.h
416 ***********************************************************************/
420 /* Memory allocator. The AST is allocated using this. */
422 /* Stack used for keeping track of regexp syntax. */
424 /* The parse result. */
425 tre_ast_node_t *result;
426 /* The regexp to parse and its length. */
428 /* The first character of the entire regexp. */
429 const char *re_start;
430 /* Current submatch ID. */
432 /* Current position (number of literal). */
434 /* The highest back reference or -1 if none seen so far. */
436 /* This flag is set if the regexp uses approximate matching. */
438 /* Compilation flags. */
440 /* If this flag is set the top-level submatch is not captured. */
444 /* Parses a wide character regexp pattern into a syntax tree. This parser
445 handles both syntaxes (BRE and ERE), including the TRE extensions. */
447 tre_parse(tre_parse_ctx_t *ctx);
451 This parser is just a simple recursive descent parser for POSIX.2
452 regexps. The parser supports both the obsolete default syntax and
453 the "extended" syntax, and some nonstandard extensions.
456 /* Characters with special meanings in regexp syntax. */
457 #define CHAR_PIPE '|'
458 #define CHAR_LPAREN '('
459 #define CHAR_RPAREN ')'
460 #define CHAR_LBRACE '{'
461 #define CHAR_RBRACE '}'
462 #define CHAR_LBRACKET '['
463 #define CHAR_RBRACKET ']'
464 #define CHAR_MINUS '-'
465 #define CHAR_STAR '*'
466 #define CHAR_QUESTIONMARK '?'
467 #define CHAR_PLUS '+'
468 #define CHAR_PERIOD '.'
469 #define CHAR_COLON ':'
470 #define CHAR_EQUAL '='
471 #define CHAR_COMMA ','
472 #define CHAR_CARET '^'
473 #define CHAR_DOLLAR '$'
474 #define CHAR_BACKSLASH '\\'
475 #define CHAR_HASH '#'
476 #define CHAR_TILDE '~'
479 /* Some macros for expanding \w, \s, etc. */
480 static const struct tre_macro_struct {
482 const char *expansion;
484 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
485 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
486 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
487 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
492 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
493 must have at least `len' items. Sets buf[0] to zero if the there
494 is no match in `tre_macros'. */
496 tre_expand_macro(const char *regex)
503 for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
504 return tre_macros[i].expansion;
508 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
509 tre_ast_node_t ***items)
511 reg_errcode_t status;
512 tre_ast_node_t **array = *items;
513 /* Allocate more space if necessary. */
516 tre_ast_node_t **new_items;
517 /* If the array is already 1024 items large, give up -- there's
518 probably an error in the regexp (e.g. not a '\0' terminated
519 string and missing ']') */
523 new_items = xrealloc(array, sizeof(*items) * *max_i);
524 if (new_items == NULL)
526 *items = array = new_items;
528 array[*i] = tre_ast_new_literal(mem, min, max, -1);
529 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
536 tre_compare_items(const void *a, const void *b)
538 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
539 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
540 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
541 int a_min = l_a->code_min, b_min = l_b->code_min;
545 else if (a_min > b_min)
551 /* Maximum number of character classes that can occur in a negated bracket
553 #define MAX_NEG_CLASSES 64
555 /* Maximum length of character class names. */
556 #define MAX_CLASS_NAME
559 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
560 tre_ctype_t neg_classes[], int *num_neg_classes,
561 tre_ast_node_t ***items, int *num_items,
564 const char *re = ctx->re;
565 reg_errcode_t status = REG_OK;
566 tre_ctype_t class = (tre_ctype_t)0;
568 int max_i = *items_size;
571 /* Build an array of the items in the bracket expression. */
572 while (status == REG_OK)
579 else if (*re == CHAR_RBRACKET && re > ctx->re)
586 tre_cint_t min = 0, max = 0;
588 int clen = mbtowc(&wc, re, -1);
590 if (clen<0) clen=1, wc=WEOF;
592 class = (tre_ctype_t)0;
593 if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
597 clen = mbtowc(&wc, re, -1);
598 if (clen<0) clen=1, wc=WEOF;
601 /* XXX - Should use collation order instead of encoding values
602 in character ranges. */
606 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
607 status = REG_ECOLLATE;
608 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
609 status = REG_ECOLLATE;
610 else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
613 const char *endptr = re + 2;
615 while (*endptr && *endptr != CHAR_COLON)
619 len = MIN(endptr - re - 2, 63);
620 strncpy(tmp_str, re + 2, len);
622 class = tre_ctype(tmp_str);
634 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
636 /* Two ranges are not allowed to share and endpoint. */
642 if (status != REG_OK)
646 if (*num_neg_classes >= MAX_NEG_CLASSES)
649 neg_classes[(*num_neg_classes)++] = class;
652 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
653 if (status != REG_OK)
655 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
658 /* Add opposite-case counterpoints if REG_ICASE is present.
659 This is broken if there are more than two "same" characters. */
660 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
662 tre_cint_t cmin, ccurr;
666 if (tre_islower(min))
668 cmin = ccurr = tre_toupper(min++);
669 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
671 ccurr = tre_toupper(min++);
672 status = tre_new_item(ctx->mem, cmin, ccurr,
675 else if (tre_isupper(min))
677 cmin = ccurr = tre_tolower(min++);
678 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
680 ccurr = tre_tolower(min++);
681 status = tre_new_item(ctx->mem, cmin, ccurr,
685 if (status != REG_OK)
688 if (status != REG_OK)
700 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
702 tre_ast_node_t *node = NULL;
704 reg_errcode_t status = REG_OK;
705 tre_ast_node_t **items, *u, *n;
706 int i = 0, j, max_i = 32, curr_max, curr_min;
707 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
708 int num_neg_classes = 0;
710 /* Start off with an array of `max_i' elements. */
711 items = xmalloc(sizeof(*items) * max_i);
715 if (*ctx->re == CHAR_CARET)
721 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
724 if (status != REG_OK)
725 goto parse_bracket_done;
727 /* Sort the array if we need to negate it. */
729 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
731 curr_max = curr_min = 0;
732 /* Build a union of the items in the array, negated if necessary. */
733 for (j = 0; j < i && status == REG_OK; j++)
736 tre_literal_t *l = items[j]->obj;
745 curr_max = MAX(max + 1, curr_max);
752 if (curr_max >= curr_min)
754 l->code_min = curr_min;
755 l->code_max = curr_max;
761 curr_min = curr_max = max + 1;
768 l->position = ctx->position;
769 if (num_neg_classes > 0)
771 l->neg_classes = tre_mem_alloc(ctx->mem,
772 (sizeof(l->neg_classes)
773 * (num_neg_classes + 1)));
774 if (l->neg_classes == NULL)
779 for (k = 0; k < num_neg_classes; k++)
780 l->neg_classes[k] = neg_classes[k];
781 l->neg_classes[k] = (tre_ctype_t)0;
784 l->neg_classes = NULL;
789 u = tre_ast_new_union(ctx->mem, node, items[j]);
797 if (status != REG_OK)
798 goto parse_bracket_done;
803 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
808 tre_literal_t *l = n->obj;
809 if (num_neg_classes > 0)
811 l->neg_classes = tre_mem_alloc(ctx->mem,
812 (sizeof(l->neg_classes)
813 * (num_neg_classes + 1)));
814 if (l->neg_classes == NULL)
817 goto parse_bracket_done;
819 for (k = 0; k < num_neg_classes; k++)
820 l->neg_classes[k] = neg_classes[k];
821 l->neg_classes[k] = (tre_ctype_t)0;
824 l->neg_classes = NULL;
829 u = tre_ast_new_union(ctx->mem, node, n);
837 if (status != REG_OK)
838 goto parse_bracket_done;
842 #endif /* TRE_DEBUG */
852 /* Parses a positive decimal integer. Returns -1 if the string does not
853 contain a valid number. */
855 tre_parse_int(const char **regex)
858 const char *r = *regex;
863 num = num * 10 + *r - '0';
872 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
875 const char *r = ctx->re;
878 /* Parse number (minimum repetition count). */
880 if (*r >= '0' && *r <= '9') {
881 min = tre_parse_int(&r);
884 /* Parse comma and second number (maximum repetition count). */
886 if (*r == CHAR_COMMA)
889 max = tre_parse_int(&r);
892 /* Check that the repeat counts are sane. */
893 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
900 /* Empty contents of {}. */
904 /* Parse the ending '}' or '\}'.*/
905 if (ctx->cflags & REG_EXTENDED)
907 if (*r != CHAR_RBRACE)
913 if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
918 /* Create the AST node(s). */
919 if (min == 0 && max == 0)
921 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
927 if (min < 0 && max < 0)
928 /* Only approximate parameters set, no repetitions. */
931 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
943 PARSE_MARK_FOR_SUBMATCH,
947 PARSE_POST_CATENATION,
952 } tre_parse_re_stack_symbol_t;
956 tre_parse(tre_parse_ctx_t *ctx)
958 tre_ast_node_t *result = NULL;
959 tre_parse_re_stack_symbol_t symbol;
960 reg_errcode_t status = REG_OK;
961 tre_stack_t *stack = ctx->stack;
962 int bottom = tre_stack_num_objects(stack);
965 if (!ctx->nofirstsub)
967 STACK_PUSH(stack, int, ctx->submatch_id);
968 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
971 STACK_PUSH(stack, int, PARSE_RE);
972 ctx->re_start = ctx->re;
975 /* The following is basically just a recursive descent parser. I use
976 an explicit stack instead of recursive functions mostly because of
977 two reasons: compatibility with systems which have an overflowable
978 call stack, and efficiency (both in lines of code and speed). */
979 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
981 if (status != REG_OK)
983 symbol = tre_stack_pop_int(stack);
987 /* Parse a full regexp. A regexp is one or more branches,
988 separated by the union operator `|'. */
989 if (ctx->cflags & REG_EXTENDED)
990 STACK_PUSHX(stack, int, PARSE_UNION);
991 STACK_PUSHX(stack, int, PARSE_BRANCH);
995 /* Parse a branch. A branch is one or more pieces, concatenated.
996 A piece is an atom possibly followed by a postfix operator. */
997 STACK_PUSHX(stack, int, PARSE_CATENATION);
998 STACK_PUSHX(stack, int, PARSE_PIECE);
1002 /* Parse a piece. A piece is an atom possibly followed by one
1003 or more postfix operators. */
1004 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1005 STACK_PUSHX(stack, int, PARSE_ATOM);
1008 case PARSE_CATENATION:
1009 /* If the expression has not ended, parse another piece. */
1015 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1017 if ((ctx->cflags & REG_EXTENDED
1018 && c == CHAR_RPAREN && depth > 0)
1019 || (!(ctx->cflags & REG_EXTENDED)
1020 && (c == CHAR_BACKSLASH
1021 && *(ctx->re + 1) == CHAR_RPAREN)))
1023 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1024 status = REG_EPAREN;
1026 if (!(ctx->cflags & REG_EXTENDED))
1032 /* Default case, left associative concatenation. */
1033 STACK_PUSHX(stack, int, PARSE_CATENATION);
1034 STACK_PUSHX(stack, voidptr, result);
1035 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1036 STACK_PUSHX(stack, int, PARSE_PIECE);
1041 case PARSE_POST_CATENATION:
1043 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1044 tre_ast_node_t *tmp_node;
1045 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1058 STACK_PUSHX(stack, int, PARSE_UNION);
1059 STACK_PUSHX(stack, voidptr, result);
1060 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1061 STACK_PUSHX(stack, int, PARSE_BRANCH);
1074 case PARSE_POST_UNION:
1076 tre_ast_node_t *tmp_node;
1077 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1078 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1086 /* Parse postfix operators. */
1092 case CHAR_QUESTIONMARK:
1093 if (!(ctx->cflags & REG_EXTENDED))
1098 tre_ast_node_t *tmp_node;
1103 if (*ctx->re == CHAR_PLUS)
1105 if (*ctx->re == CHAR_QUESTIONMARK)
1109 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1111 if (tmp_node == NULL)
1114 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1118 case CHAR_BACKSLASH:
1119 /* "\{" is special without REG_EXTENDED */
1120 if (!(ctx->cflags & REG_EXTENDED)
1121 && *(ctx->re + 1) == CHAR_LBRACE)
1130 /* "{" is literal without REG_EXTENDED */
1131 if (!(ctx->cflags & REG_EXTENDED))
1137 status = tre_parse_bound(ctx, &result);
1138 if (status != REG_OK)
1140 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1146 /* Parse an atom. An atom is a regular expression enclosed in `()',
1147 an empty set of `()', a bracket expression, `.', `^', `$',
1148 a `\' followed by a character, or a single character. */
1150 /* End of regexp? (empty string). */
1156 case CHAR_LPAREN: /* parenthesized subexpression */
1158 if (ctx->cflags & REG_EXTENDED
1159 || (ctx->re > ctx->re_start
1160 && *(ctx->re - 1) == CHAR_BACKSLASH))
1165 /* First parse a whole RE, then mark the resulting tree
1167 STACK_PUSHX(stack, int, ctx->submatch_id);
1168 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1169 STACK_PUSHX(stack, int, PARSE_RE);
1177 case CHAR_RPAREN: /* end of current subexpression */
1178 if ((ctx->cflags & REG_EXTENDED && depth > 0)
1179 || (ctx->re > ctx->re_start
1180 && *(ctx->re - 1) == CHAR_BACKSLASH))
1182 /* We were expecting an atom, but instead the current
1183 subexpression was closed. POSIX leaves the meaning of
1184 this to be implementation-defined. We interpret this as
1185 an empty expression (which matches an empty string). */
1186 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1189 if (!(ctx->cflags & REG_EXTENDED))
1196 case CHAR_LBRACKET: /* bracket expression */
1198 status = tre_parse_bracket(ctx, &result);
1199 if (status != REG_OK)
1203 case CHAR_BACKSLASH:
1204 /* If this is "\(" or "\)" chew off the backslash and
1206 if (!(ctx->cflags & REG_EXTENDED)
1207 && (*(ctx->re + 1) == CHAR_LPAREN
1208 || *(ctx->re + 1) == CHAR_RPAREN))
1211 STACK_PUSHX(stack, int, PARSE_ATOM);
1215 /* If a macro is used, parse the expanded macro recursively. */
1217 const char *buf = tre_expand_macro(ctx->re + 1);
1220 tre_parse_ctx_t subctx;
1221 memcpy(&subctx, ctx, sizeof(subctx));
1223 subctx.nofirstsub = 1;
1224 status = tre_parse(&subctx);
1225 if (status != REG_OK)
1228 ctx->position = subctx.position;
1229 result = subctx.result;
1235 /* Trailing backslash. */
1242 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1247 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1248 ASSERT_AT_WB_NEG, -1);
1252 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1257 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1263 if (ctx->re[0] != CHAR_LBRACE)
1265 /* 8 bit hex char. */
1266 char tmp[3] = {0, 0, 0};
1269 if (tre_isxdigit(ctx->re[0]))
1271 tmp[0] = (char)ctx->re[0];
1274 if (tre_isxdigit(ctx->re[0]))
1276 tmp[1] = (char)ctx->re[0];
1279 val = strtol(tmp, NULL, 16);
1280 result = tre_ast_new_literal(ctx->mem, (int)val,
1281 (int)val, ctx->position);
1292 while (*ctx->re && i < sizeof tmp)
1294 if (ctx->re[0] == CHAR_RBRACE)
1296 if (tre_isxdigit(ctx->re[0]))
1298 tmp[i] = (char)ctx->re[0];
1307 val = strtol(tmp, NULL, 16);
1308 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1316 if (tre_isdigit(*ctx->re))
1318 /* Back reference. */
1319 int val = *ctx->re - '0';
1320 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1325 ctx->max_backref = MAX(val, ctx->max_backref);
1330 /* Escaped character. */
1331 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1342 case CHAR_PERIOD: /* the any-symbol */
1343 if (ctx->cflags & REG_NEWLINE)
1345 tre_ast_node_t *tmp1;
1346 tre_ast_node_t *tmp2;
1347 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1351 tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1355 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1362 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1371 case CHAR_CARET: /* beginning of line assertion */
1372 /* '^' has a special meaning everywhere in EREs, and in the
1373 beginning of the RE and after \( is BREs. */
1374 if (ctx->cflags & REG_EXTENDED
1375 || (ctx->re - 2 >= ctx->re_start
1376 && *(ctx->re - 2) == CHAR_BACKSLASH
1377 && *(ctx->re - 1) == CHAR_LPAREN)
1378 || ctx->re == ctx->re_start)
1380 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1390 case CHAR_DOLLAR: /* end of line assertion. */
1391 /* '$' is special everywhere in EREs, and in the end of the
1392 string and before \) is BREs. */
1393 if (ctx->cflags & REG_EXTENDED
1394 || (*(ctx->re + 1) == CHAR_BACKSLASH
1395 && *(ctx->re + 2) == CHAR_RPAREN)
1398 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1411 /* We are expecting an atom. If the subexpression (or the whole
1412 regexp ends here, we interpret it as an empty expression
1413 (which matches an empty string). */
1416 || *ctx->re == CHAR_STAR
1417 || (ctx->cflags & REG_EXTENDED
1418 && (*ctx->re == CHAR_PIPE
1419 || *ctx->re == CHAR_LBRACE
1420 || *ctx->re == CHAR_PLUS
1421 || *ctx->re == CHAR_QUESTIONMARK))
1422 /* Test for "\)" in BRE mode. */
1423 || (!(ctx->cflags & REG_EXTENDED)
1425 && *ctx->re == CHAR_BACKSLASH
1426 && *(ctx->re + 1) == CHAR_LBRACE)))
1428 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1435 int clen = mbtowc(&wc, ctx->re, -1);
1436 if (clen<0) clen=1, wc=WEOF;
1438 /* Note that we can't use an tre_isalpha() test here, since there
1439 may be characters which are alphabetic but neither upper or
1441 if (ctx->cflags & REG_ICASE
1442 && (tre_isupper(wc) || tre_islower(wc)))
1444 tre_ast_node_t *tmp1;
1445 tre_ast_node_t *tmp2;
1447 /* XXX - Can there be more than one opposite-case
1448 counterpoints for some character in some locale? Or
1449 more than two characters which all should be regarded
1450 the same character if case is ignored? If yes, there
1451 does not seem to be a portable way to detect it. I guess
1452 that at least for multi-character collating elements there
1453 could be several opposite-case counterpoints, but they
1454 cannot be supported portably anyway. */
1455 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1460 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1465 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1471 result = tre_ast_new_literal(ctx->mem, wc, wc,
1482 case PARSE_MARK_FOR_SUBMATCH:
1484 int submatch_id = tre_stack_pop_int(stack);
1486 if (result->submatch_id >= 0)
1488 tre_ast_node_t *n, *tmp_node;
1489 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1492 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1493 if (tmp_node == NULL)
1495 tmp_node->num_submatches = result->num_submatches;
1498 result->submatch_id = submatch_id;
1499 result->num_submatches++;
1503 case PARSE_RESTORE_CFLAGS:
1504 ctx->cflags = tre_stack_pop_int(stack);
1513 /* Check for missing closing parentheses. */
1517 if (status == REG_OK)
1518 ctx->result = result;
1525 /***********************************************************************
1527 ***********************************************************************/
1532 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1537 Algorithms to setup tags so that submatch addressing can be done.
1541 /* Inserts a catenation node to the root of the tree given in `node'.
1542 As the left child a new tag with number `tag_id' to `node' is added,
1543 and the right child is the old root. */
1544 static reg_errcode_t
1545 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1547 tre_catenation_t *c;
1549 c = tre_mem_alloc(mem, sizeof(*c));
1552 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1553 if (c->left == NULL)
1555 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1556 if (c->right == NULL)
1559 c->right->obj = node->obj;
1560 c->right->type = node->type;
1561 c->right->nullable = -1;
1562 c->right->submatch_id = -1;
1563 c->right->firstpos = NULL;
1564 c->right->lastpos = NULL;
1565 c->right->num_tags = 0;
1567 node->type = CATENATION;
1571 /* Inserts a catenation node to the root of the tree given in `node'.
1572 As the right child a new tag with number `tag_id' to `node' is added,
1573 and the left child is the old root. */
1574 static reg_errcode_t
1575 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1577 tre_catenation_t *c;
1579 c = tre_mem_alloc(mem, sizeof(*c));
1582 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1583 if (c->right == NULL)
1585 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1586 if (c->left == NULL)
1589 c->left->obj = node->obj;
1590 c->left->type = node->type;
1591 c->left->nullable = -1;
1592 c->left->submatch_id = -1;
1593 c->left->firstpos = NULL;
1594 c->left->lastpos = NULL;
1595 c->left->num_tags = 0;
1597 node->type = CATENATION;
1603 ADDTAGS_AFTER_ITERATION,
1604 ADDTAGS_AFTER_UNION_LEFT,
1605 ADDTAGS_AFTER_UNION_RIGHT,
1606 ADDTAGS_AFTER_CAT_LEFT,
1607 ADDTAGS_AFTER_CAT_RIGHT,
1608 ADDTAGS_SET_SUBMATCH_END
1609 } tre_addtags_symbol_t;
1618 /* Go through `regset' and set submatch data for submatches that are
1621 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1625 for (i = 0; regset[i] >= 0; i++)
1627 int id = regset[i] / 2;
1628 int start = !(regset[i] % 2);
1630 tnfa->submatch_data[id].so_tag = tag;
1632 tnfa->submatch_data[id].eo_tag = tag;
1638 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1639 subexpressions marked for submatch addressing can be traced. */
1640 static reg_errcode_t
1641 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1644 reg_errcode_t status = REG_OK;
1645 tre_addtags_symbol_t symbol;
1646 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1647 int bottom = tre_stack_num_objects(stack);
1648 /* True for first pass (counting number of needed tags) */
1649 int first_pass = (mem == NULL || tnfa == NULL);
1650 int *regset, *orig_regset;
1651 int num_tags = 0; /* Total number of tags. */
1652 int num_minimals = 0; /* Number of special minimal tags. */
1653 int tag = 0; /* The tag that is to be added next. */
1654 int next_tag = 1; /* Next tag to use after this one. */
1655 int *parents; /* Stack of submatches the current submatch is
1657 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1658 tre_tag_states_t *saved_states;
1660 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1664 tnfa->minimal_tags[0] = -1;
1667 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1671 orig_regset = regset;
1673 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1674 if (parents == NULL)
1681 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1682 if (saved_states == NULL)
1691 for (i = 0; i <= tnfa->num_submatches; i++)
1692 saved_states[i].tag = -1;
1695 STACK_PUSH(stack, voidptr, node);
1696 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1698 while (tre_stack_num_objects(stack) > bottom)
1700 if (status != REG_OK)
1703 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1707 case ADDTAGS_SET_SUBMATCH_END:
1709 int id = tre_stack_pop_int(stack);
1712 /* Add end of this submatch to regset. */
1713 for (i = 0; regset[i] >= 0; i++);
1714 regset[i] = id * 2 + 1;
1717 /* Pop this submatch from the parents stack. */
1718 for (i = 0; parents[i] >= 0; i++);
1719 parents[i - 1] = -1;
1723 case ADDTAGS_RECURSE:
1724 node = tre_stack_pop_voidptr(stack);
1726 if (node->submatch_id >= 0)
1728 int id = node->submatch_id;
1732 /* Add start of this submatch to regset. */
1733 for (i = 0; regset[i] >= 0; i++);
1739 for (i = 0; parents[i] >= 0; i++);
1740 tnfa->submatch_data[id].parents = NULL;
1743 int *p = xmalloc(sizeof(*p) * (i + 1));
1746 status = REG_ESPACE;
1749 assert(tnfa->submatch_data[id].parents == NULL);
1750 tnfa->submatch_data[id].parents = p;
1751 for (i = 0; parents[i] >= 0; i++)
1757 /* Add end of this submatch to regset after processing this
1759 STACK_PUSHX(stack, int, node->submatch_id);
1760 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1767 tre_literal_t *lit = node->obj;
1769 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1774 /* Regset is not empty, so add a tag before the
1775 literal or backref. */
1778 status = tre_add_tag_left(mem, node, tag);
1779 tnfa->tag_directions[tag] = direction;
1780 if (minimal_tag >= 0)
1782 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1783 tnfa->minimal_tags[i] = tag;
1784 tnfa->minimal_tags[i + 1] = minimal_tag;
1785 tnfa->minimal_tags[i + 2] = -1;
1789 tre_purge_regset(regset, tnfa, tag);
1804 assert(!IS_TAG(lit));
1810 tre_catenation_t *cat = node->obj;
1811 tre_ast_node_t *left = cat->left;
1812 tre_ast_node_t *right = cat->right;
1813 int reserved_tag = -1;
1816 /* After processing right child. */
1817 STACK_PUSHX(stack, voidptr, node);
1818 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1820 /* Process right child. */
1821 STACK_PUSHX(stack, voidptr, right);
1822 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1824 /* After processing left child. */
1825 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1826 if (left->num_tags > 0 && right->num_tags > 0)
1828 /* Reserve the next tag to the right child. */
1829 reserved_tag = next_tag;
1832 STACK_PUSHX(stack, int, reserved_tag);
1833 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1835 /* Process left child. */
1836 STACK_PUSHX(stack, voidptr, left);
1837 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1843 tre_iteration_t *iter = node->obj;
1847 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1851 STACK_PUSHX(stack, int, tag);
1852 STACK_PUSHX(stack, int, iter->minimal);
1854 STACK_PUSHX(stack, voidptr, node);
1855 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1857 STACK_PUSHX(stack, voidptr, iter->arg);
1858 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1860 /* Regset is not empty, so add a tag here. */
1861 if (regset[0] >= 0 || iter->minimal)
1866 status = tre_add_tag_left(mem, node, tag);
1868 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1870 tnfa->tag_directions[tag] = direction;
1871 if (minimal_tag >= 0)
1873 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1874 tnfa->minimal_tags[i] = tag;
1875 tnfa->minimal_tags[i + 1] = minimal_tag;
1876 tnfa->minimal_tags[i + 2] = -1;
1880 tre_purge_regset(regset, tnfa, tag);
1888 direction = TRE_TAG_MINIMIZE;
1893 tre_union_t *uni = node->obj;
1894 tre_ast_node_t *left = uni->left;
1895 tre_ast_node_t *right = uni->right;
1901 left_tag = next_tag;
1902 right_tag = next_tag + 1;
1907 right_tag = next_tag;
1910 /* After processing right child. */
1911 STACK_PUSHX(stack, int, right_tag);
1912 STACK_PUSHX(stack, int, left_tag);
1913 STACK_PUSHX(stack, voidptr, regset);
1914 STACK_PUSHX(stack, int, regset[0] >= 0);
1915 STACK_PUSHX(stack, voidptr, node);
1916 STACK_PUSHX(stack, voidptr, right);
1917 STACK_PUSHX(stack, voidptr, left);
1918 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1920 /* Process right child. */
1921 STACK_PUSHX(stack, voidptr, right);
1922 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1924 /* After processing left child. */
1925 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1927 /* Process left child. */
1928 STACK_PUSHX(stack, voidptr, left);
1929 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1931 /* Regset is not empty, so add a tag here. */
1937 status = tre_add_tag_left(mem, node, tag);
1938 tnfa->tag_directions[tag] = direction;
1939 if (minimal_tag >= 0)
1941 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1942 tnfa->minimal_tags[i] = tag;
1943 tnfa->minimal_tags[i + 1] = minimal_tag;
1944 tnfa->minimal_tags[i + 2] = -1;
1948 tre_purge_regset(regset, tnfa, tag);
1957 if (node->num_submatches > 0)
1959 /* The next two tags are reserved for markers. */
1969 if (node->submatch_id >= 0)
1972 /* Push this submatch on the parents stack. */
1973 for (i = 0; parents[i] >= 0; i++);
1974 parents[i] = node->submatch_id;
1975 parents[i + 1] = -1;
1978 break; /* end case: ADDTAGS_RECURSE */
1980 case ADDTAGS_AFTER_ITERATION:
1984 node = tre_stack_pop_voidptr(stack);
1987 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1988 + tre_stack_pop_int(stack);
1993 minimal = tre_stack_pop_int(stack);
1994 enter_tag = tre_stack_pop_int(stack);
1996 minimal_tag = enter_tag;
2002 direction = TRE_TAG_MINIMIZE;
2004 direction = TRE_TAG_MAXIMIZE;
2009 case ADDTAGS_AFTER_CAT_LEFT:
2011 int new_tag = tre_stack_pop_int(stack);
2012 next_tag = tre_stack_pop_int(stack);
2020 case ADDTAGS_AFTER_CAT_RIGHT:
2021 node = tre_stack_pop_voidptr(stack);
2023 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
2024 + ((tre_catenation_t *)node->obj)->right->num_tags;
2027 case ADDTAGS_AFTER_UNION_LEFT:
2028 /* Lift the bottom of the `regset' array so that when processing
2029 the right operand the items currently in the array are
2030 invisible. The original bottom was saved at ADDTAGS_UNION and
2031 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
2032 while (*regset >= 0)
2036 case ADDTAGS_AFTER_UNION_RIGHT:
2038 int added_tags, tag_left, tag_right;
2039 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2040 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2041 node = tre_stack_pop_voidptr(stack);
2042 added_tags = tre_stack_pop_int(stack);
2045 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2046 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2047 + ((node->num_submatches > 0) ? 2 : 0);
2049 regset = tre_stack_pop_voidptr(stack);
2050 tag_left = tre_stack_pop_int(stack);
2051 tag_right = tre_stack_pop_int(stack);
2053 /* Add tags after both children, the left child gets a smaller
2054 tag than the right child. This guarantees that we prefer
2055 the left child over the right child. */
2056 /* XXX - This is not always necessary (if the children have
2057 tags which must be seen for every match of that child). */
2058 /* XXX - Check if this is the only place where tre_add_tag_right
2059 is used. If so, use tre_add_tag_left (putting the tag before
2060 the child as opposed after the child) and throw away
2061 tre_add_tag_right. */
2062 if (node->num_submatches > 0)
2066 status = tre_add_tag_right(mem, left, tag_left);
2067 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2068 status = tre_add_tag_right(mem, right, tag_right);
2069 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2073 direction = TRE_TAG_MAXIMIZE;
2081 } /* end switch(symbol) */
2082 } /* end while(tre_stack_num_objects(stack) > bottom) */
2085 tre_purge_regset(regset, tnfa, tag);
2087 if (!first_pass && minimal_tag >= 0)
2090 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2091 tnfa->minimal_tags[i] = tag;
2092 tnfa->minimal_tags[i + 1] = minimal_tag;
2093 tnfa->minimal_tags[i + 2] = -1;
2098 assert(tree->num_tags == num_tags);
2099 tnfa->end_tag = num_tags;
2100 tnfa->num_tags = num_tags;
2101 tnfa->num_minimals = num_minimals;
2104 xfree(saved_states);
2111 AST to TNFA compilation routines.
2117 } tre_copyast_symbol_t;
2119 /* Flags for tre_copy_ast(). */
2120 #define COPY_REMOVE_TAGS 1
2121 #define COPY_MAXIMIZE_FIRST_TAG 2
2123 static reg_errcode_t
2124 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2125 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2126 tre_ast_node_t **copy, int *max_pos)
2128 reg_errcode_t status = REG_OK;
2129 int bottom = tre_stack_num_objects(stack);
2132 tre_ast_node_t **result = copy;
2133 tre_copyast_symbol_t symbol;
2135 STACK_PUSH(stack, voidptr, ast);
2136 STACK_PUSH(stack, int, COPY_RECURSE);
2138 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2140 tre_ast_node_t *node;
2141 if (status != REG_OK)
2144 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2147 case COPY_SET_RESULT_PTR:
2148 result = tre_stack_pop_voidptr(stack);
2151 node = tre_stack_pop_voidptr(stack);
2156 tre_literal_t *lit = node->obj;
2157 int pos = lit->position;
2158 int min = lit->code_min;
2159 int max = lit->code_max;
2160 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2162 /* XXX - e.g. [ab] has only one position but two
2163 nodes, so we are creating holes in the state space
2164 here. Not fatal, just wastes memory. */
2168 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2170 /* Change this tag to empty. */
2174 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2177 /* Maximize the first tag. */
2178 tag_directions[max] = TRE_TAG_MAXIMIZE;
2181 *result = tre_ast_new_literal(mem, min, max, pos);
2182 if (*result == NULL)
2183 status = REG_ESPACE;
2191 tre_union_t *uni = node->obj;
2193 *result = tre_ast_new_union(mem, uni->left, uni->right);
2194 if (*result == NULL)
2196 status = REG_ESPACE;
2199 tmp = (*result)->obj;
2200 result = &tmp->left;
2201 STACK_PUSHX(stack, voidptr, uni->right);
2202 STACK_PUSHX(stack, int, COPY_RECURSE);
2203 STACK_PUSHX(stack, voidptr, &tmp->right);
2204 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2205 STACK_PUSHX(stack, voidptr, uni->left);
2206 STACK_PUSHX(stack, int, COPY_RECURSE);
2211 tre_catenation_t *cat = node->obj;
2212 tre_catenation_t *tmp;
2213 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2214 if (*result == NULL)
2216 status = REG_ESPACE;
2219 tmp = (*result)->obj;
2222 result = &tmp->left;
2224 STACK_PUSHX(stack, voidptr, cat->right);
2225 STACK_PUSHX(stack, int, COPY_RECURSE);
2226 STACK_PUSHX(stack, voidptr, &tmp->right);
2227 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2228 STACK_PUSHX(stack, voidptr, cat->left);
2229 STACK_PUSHX(stack, int, COPY_RECURSE);
2234 tre_iteration_t *iter = node->obj;
2235 STACK_PUSHX(stack, voidptr, iter->arg);
2236 STACK_PUSHX(stack, int, COPY_RECURSE);
2237 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2238 iter->max, iter->minimal);
2239 if (*result == NULL)
2241 status = REG_ESPACE;
2244 iter = (*result)->obj;
2245 result = &iter->arg;
2255 *pos_add += num_copied;
2262 } tre_expand_ast_symbol_t;
2264 /* Expands each iteration node that has a finite nonzero minimum or maximum
2265 iteration count to a catenated sequence of copies of the node. */
2266 static reg_errcode_t
2267 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2268 int *position, tre_tag_direction_t *tag_directions,
2271 reg_errcode_t status = REG_OK;
2272 int bottom = tre_stack_num_objects(stack);
2274 int pos_add_total = 0;
2278 STACK_PUSHR(stack, voidptr, ast);
2279 STACK_PUSHR(stack, int, EXPAND_RECURSE);
2280 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2282 tre_ast_node_t *node;
2283 tre_expand_ast_symbol_t symbol;
2285 if (status != REG_OK)
2288 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2289 node = tre_stack_pop_voidptr(stack);
2292 case EXPAND_RECURSE:
2297 tre_literal_t *lit= node->obj;
2298 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2300 lit->position += pos_add;
2301 if (lit->position > max_pos)
2302 max_pos = lit->position;
2308 tre_union_t *uni = node->obj;
2309 STACK_PUSHX(stack, voidptr, uni->right);
2310 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2311 STACK_PUSHX(stack, voidptr, uni->left);
2312 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2317 tre_catenation_t *cat = node->obj;
2318 STACK_PUSHX(stack, voidptr, cat->right);
2319 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2320 STACK_PUSHX(stack, voidptr, cat->left);
2321 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2326 tre_iteration_t *iter = node->obj;
2327 STACK_PUSHX(stack, int, pos_add);
2328 STACK_PUSHX(stack, voidptr, node);
2329 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2330 STACK_PUSHX(stack, voidptr, iter->arg);
2331 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2332 /* If we are going to expand this node at EXPAND_AFTER_ITER
2333 then don't increase the `pos' fields of the nodes now, it
2334 will get done when expanding. */
2335 if (iter->min > 1 || iter->max > 1)
2345 case EXPAND_AFTER_ITER:
2347 tre_iteration_t *iter = node->obj;
2349 pos_add = tre_stack_pop_int(stack);
2350 pos_add_last = pos_add;
2351 if (iter->min > 1 || iter->max > 1)
2353 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2355 int pos_add_save = pos_add;
2357 /* Create a catenated sequence of copies of the node. */
2358 for (j = 0; j < iter->min; j++)
2360 tre_ast_node_t *copy;
2361 /* Remove tags from all but the last copy. */
2362 int flags = ((j + 1 < iter->min)
2364 : COPY_MAXIMIZE_FIRST_TAG);
2365 pos_add_save = pos_add;
2366 status = tre_copy_ast(mem, stack, iter->arg, flags,
2367 &pos_add, tag_directions, ©,
2369 if (status != REG_OK)
2372 seq1 = tre_ast_new_catenation(mem, seq1, copy);
2379 if (iter->max == -1)
2381 /* No upper limit. */
2382 pos_add_save = pos_add;
2383 status = tre_copy_ast(mem, stack, iter->arg, 0,
2384 &pos_add, NULL, &seq2, &max_pos);
2385 if (status != REG_OK)
2387 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2393 for (j = iter->min; j < iter->max; j++)
2395 tre_ast_node_t *tmp, *copy;
2396 pos_add_save = pos_add;
2397 status = tre_copy_ast(mem, stack, iter->arg, 0,
2398 &pos_add, NULL, ©, &max_pos);
2399 if (status != REG_OK)
2402 seq2 = tre_ast_new_catenation(mem, copy, seq2);
2407 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2410 seq2 = tre_ast_new_union(mem, tmp, seq2);
2416 pos_add = pos_add_save;
2419 else if (seq2 != NULL)
2420 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2423 node->obj = seq1->obj;
2424 node->type = seq1->type;
2428 pos_add_total += pos_add - pos_add_last;
2429 if (iter_depth == 0)
2430 pos_add = pos_add_total;
2440 *position += pos_add_total;
2442 /* `max_pos' should never be larger than `*position' if the above
2443 code works, but just an extra safeguard let's make sure
2444 `*position' is set large enough so enough memory will be
2445 allocated for the transition table. */
2446 if (max_pos > *position)
2447 *position = max_pos;
2452 static tre_pos_and_tags_t *
2453 tre_set_empty(tre_mem_t mem)
2455 tre_pos_and_tags_t *new_set;
2457 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2458 if (new_set == NULL)
2461 new_set[0].position = -1;
2462 new_set[0].code_min = -1;
2463 new_set[0].code_max = -1;
2468 static tre_pos_and_tags_t *
2469 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2470 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2472 tre_pos_and_tags_t *new_set;
2474 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2475 if (new_set == NULL)
2478 new_set[0].position = position;
2479 new_set[0].code_min = code_min;
2480 new_set[0].code_max = code_max;
2481 new_set[0].class = class;
2482 new_set[0].neg_classes = neg_classes;
2483 new_set[0].backref = backref;
2484 new_set[1].position = -1;
2485 new_set[1].code_min = -1;
2486 new_set[1].code_max = -1;
2491 static tre_pos_and_tags_t *
2492 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2493 int *tags, int assertions)
2496 tre_pos_and_tags_t *new_set;
2500 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2501 for (s1 = 0; set1[s1].position >= 0; s1++);
2502 for (s2 = 0; set2[s2].position >= 0; s2++);
2503 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2507 for (s1 = 0; set1[s1].position >= 0; s1++)
2509 new_set[s1].position = set1[s1].position;
2510 new_set[s1].code_min = set1[s1].code_min;
2511 new_set[s1].code_max = set1[s1].code_max;
2512 new_set[s1].assertions = set1[s1].assertions | assertions;
2513 new_set[s1].class = set1[s1].class;
2514 new_set[s1].neg_classes = set1[s1].neg_classes;
2515 new_set[s1].backref = set1[s1].backref;
2516 if (set1[s1].tags == NULL && tags == NULL)
2517 new_set[s1].tags = NULL;
2520 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2521 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2522 * (i + num_tags + 1)));
2523 if (new_tags == NULL)
2525 for (j = 0; j < i; j++)
2526 new_tags[j] = set1[s1].tags[j];
2527 for (i = 0; i < num_tags; i++)
2528 new_tags[j + i] = tags[i];
2529 new_tags[j + i] = -1;
2530 new_set[s1].tags = new_tags;
2534 for (s2 = 0; set2[s2].position >= 0; s2++)
2536 new_set[s1 + s2].position = set2[s2].position;
2537 new_set[s1 + s2].code_min = set2[s2].code_min;
2538 new_set[s1 + s2].code_max = set2[s2].code_max;
2539 /* XXX - why not | assertions here as well? */
2540 new_set[s1 + s2].assertions = set2[s2].assertions;
2541 new_set[s1 + s2].class = set2[s2].class;
2542 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2543 new_set[s1 + s2].backref = set2[s2].backref;
2544 if (set2[s2].tags == NULL)
2545 new_set[s1 + s2].tags = NULL;
2548 for (i = 0; set2[s2].tags[i] >= 0; i++);
2549 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2550 if (new_tags == NULL)
2552 for (j = 0; j < i; j++)
2553 new_tags[j] = set2[s2].tags[j];
2555 new_set[s1 + s2].tags = new_tags;
2558 new_set[s1 + s2].position = -1;
2562 /* Finds the empty path through `node' which is the one that should be
2563 taken according to POSIX.2 rules, and adds the tags on that path to
2564 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2565 set to the number of tags seen on the path. */
2566 static reg_errcode_t
2567 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2568 int *assertions, int *params, int *num_tags_seen,
2573 tre_catenation_t *cat;
2574 tre_iteration_t *iter;
2576 int bottom = tre_stack_num_objects(stack);
2577 reg_errcode_t status = REG_OK;
2581 status = tre_stack_push_voidptr(stack, node);
2583 /* Walk through the tree recursively. */
2584 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2586 node = tre_stack_pop_voidptr(stack);
2591 lit = (tre_literal_t *)node->obj;
2592 switch (lit->code_min)
2595 if (lit->code_max >= 0)
2599 /* Add the tag to `tags'. */
2600 for (i = 0; tags[i] >= 0; i++)
2601 if (tags[i] == lit->code_max)
2605 tags[i] = lit->code_max;
2614 assert(lit->code_max >= 1
2615 || lit->code_max <= ASSERT_LAST);
2616 if (assertions != NULL)
2617 *assertions |= lit->code_max;
2628 /* Subexpressions starting earlier take priority over ones
2629 starting later, so we prefer the left subexpression over the
2630 right subexpression. */
2631 uni = (tre_union_t *)node->obj;
2632 if (uni->left->nullable)
2633 STACK_PUSHX(stack, voidptr, uni->left)
2634 else if (uni->right->nullable)
2635 STACK_PUSHX(stack, voidptr, uni->right)
2641 /* The path must go through both children. */
2642 cat = (tre_catenation_t *)node->obj;
2643 assert(cat->left->nullable);
2644 assert(cat->right->nullable);
2645 STACK_PUSHX(stack, voidptr, cat->left);
2646 STACK_PUSHX(stack, voidptr, cat->right);
2650 /* A match with an empty string is preferred over no match at
2651 all, so we go through the argument if possible. */
2652 iter = (tre_iteration_t *)node->obj;
2653 if (iter->arg->nullable)
2654 STACK_PUSHX(stack, voidptr, iter->arg);
2670 NFL_POST_CATENATION,
2672 } tre_nfl_stack_symbol_t;
2675 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2676 the nodes of the AST `tree'. */
2677 static reg_errcode_t
2678 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2680 int bottom = tre_stack_num_objects(stack);
2682 STACK_PUSHR(stack, voidptr, tree);
2683 STACK_PUSHR(stack, int, NFL_RECURSE);
2685 while (tre_stack_num_objects(stack) > bottom)
2687 tre_nfl_stack_symbol_t symbol;
2688 tre_ast_node_t *node;
2690 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2691 node = tre_stack_pop_voidptr(stack);
2699 tre_literal_t *lit = (tre_literal_t *)node->obj;
2700 if (IS_BACKREF(lit))
2702 /* Back references: nullable = false, firstpos = {i},
2705 node->firstpos = tre_set_one(mem, lit->position, 0,
2706 TRE_CHAR_MAX, 0, NULL, -1);
2707 if (!node->firstpos)
2709 node->lastpos = tre_set_one(mem, lit->position, 0,
2710 TRE_CHAR_MAX, 0, NULL,
2711 (int)lit->code_max);
2715 else if (lit->code_min < 0)
2717 /* Tags, empty strings, params, and zero width assertions:
2718 nullable = true, firstpos = {}, and lastpos = {}. */
2720 node->firstpos = tre_set_empty(mem);
2721 if (!node->firstpos)
2723 node->lastpos = tre_set_empty(mem);
2729 /* Literal at position i: nullable = false, firstpos = {i},
2733 tre_set_one(mem, lit->position, (int)lit->code_min,
2734 (int)lit->code_max, 0, NULL, -1);
2735 if (!node->firstpos)
2737 node->lastpos = tre_set_one(mem, lit->position,
2740 lit->u.class, lit->neg_classes,
2749 /* Compute the attributes for the two subtrees, and after that
2751 STACK_PUSHR(stack, voidptr, node);
2752 STACK_PUSHR(stack, int, NFL_POST_UNION);
2753 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2754 STACK_PUSHR(stack, int, NFL_RECURSE);
2755 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2756 STACK_PUSHR(stack, int, NFL_RECURSE);
2760 /* Compute the attributes for the two subtrees, and after that
2762 STACK_PUSHR(stack, voidptr, node);
2763 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2764 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2765 STACK_PUSHR(stack, int, NFL_RECURSE);
2766 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2767 STACK_PUSHR(stack, int, NFL_RECURSE);
2771 /* Compute the attributes for the subtree, and after that for
2773 STACK_PUSHR(stack, voidptr, node);
2774 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2775 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2776 STACK_PUSHR(stack, int, NFL_RECURSE);
2779 break; /* end case: NFL_RECURSE */
2781 case NFL_POST_UNION:
2783 tre_union_t *uni = (tre_union_t *)node->obj;
2784 node->nullable = uni->left->nullable || uni->right->nullable;
2785 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2786 uni->right->firstpos, NULL, 0);
2787 if (!node->firstpos)
2789 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2790 uni->right->lastpos, NULL, 0);
2796 case NFL_POST_ITERATION:
2798 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2800 if (iter->min == 0 || iter->arg->nullable)
2804 node->firstpos = iter->arg->firstpos;
2805 node->lastpos = iter->arg->lastpos;
2809 case NFL_POST_CATENATION:
2811 int num_tags, *tags, assertions, params_seen;
2813 reg_errcode_t status;
2814 tre_catenation_t *cat = node->obj;
2815 node->nullable = cat->left->nullable && cat->right->nullable;
2817 /* Compute firstpos. */
2818 if (cat->left->nullable)
2820 /* The left side matches the empty string. Make a first pass
2821 with tre_match_empty() to get the number of tags and
2823 status = tre_match_empty(stack, cat->left,
2824 NULL, NULL, NULL, &num_tags,
2826 if (status != REG_OK)
2828 /* Allocate arrays for the tags and parameters. */
2829 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2834 /* Second pass with tre_mach_empty() to get the list of
2835 tags and parameters. */
2836 status = tre_match_empty(stack, cat->left, tags,
2837 &assertions, params, NULL, NULL);
2838 if (status != REG_OK)
2844 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2847 if (!node->firstpos)
2852 node->firstpos = cat->left->firstpos;
2855 /* Compute lastpos. */
2856 if (cat->right->nullable)
2858 /* The right side matches the empty string. Make a first pass
2859 with tre_match_empty() to get the number of tags and
2861 status = tre_match_empty(stack, cat->right,
2862 NULL, NULL, NULL, &num_tags,
2864 if (status != REG_OK)
2866 /* Allocate arrays for the tags and parameters. */
2867 tags = xmalloc(sizeof(int) * (num_tags + 1));
2872 /* Second pass with tre_mach_empty() to get the list of
2873 tags and parameters. */
2874 status = tre_match_empty(stack, cat->right, tags,
2875 &assertions, params, NULL, NULL);
2876 if (status != REG_OK)
2882 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2890 node->lastpos = cat->right->lastpos;
2905 /* Adds a transition from each position in `p1' to each position in `p2'. */
2906 static reg_errcode_t
2907 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2908 tre_tnfa_transition_t *transitions,
2909 int *counts, int *offs)
2911 tre_pos_and_tags_t *orig_p2 = p2;
2912 tre_tnfa_transition_t *trans;
2913 int i, j, k, l, dup, prev_p2_pos;
2915 if (transitions != NULL)
2916 while (p1->position >= 0)
2920 while (p2->position >= 0)
2922 /* Optimization: if this position was already handled, skip it. */
2923 if (p2->position == prev_p2_pos)
2928 prev_p2_pos = p2->position;
2929 /* Set `trans' to point to the next unused transition from
2930 position `p1->position'. */
2931 trans = transitions + offs[p1->position];
2932 while (trans->state != NULL)
2935 /* If we find a previous transition from `p1->position' to
2936 `p2->position', it is overwritten. This can happen only
2937 if there are nested loops in the regexp, like in "((a)*)*".
2938 In POSIX.2 repetition using the outer loop is always
2939 preferred over using the inner loop. Therefore the
2940 transition for the inner loop is useless and can be thrown
2942 /* XXX - The same position is used for all nodes in a bracket
2943 expression, so this optimization cannot be used (it will
2944 break bracket expressions) unless I figure out a way to
2946 if (trans->state_id == p2->position)
2954 if (trans->state == NULL)
2955 (trans + 1)->state = NULL;
2956 /* Use the character ranges, assertions, etc. from `p1' for
2957 the transition from `p1' to `p2'. */
2958 trans->code_min = p1->code_min;
2959 trans->code_max = p1->code_max;
2960 trans->state = transitions + offs[p2->position];
2961 trans->state_id = p2->position;
2962 trans->assertions = p1->assertions | p2->assertions
2963 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2964 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2965 if (p1->backref >= 0)
2967 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2968 assert(p2->backref < 0);
2969 trans->u.backref = p1->backref;
2970 trans->assertions |= ASSERT_BACKREF;
2973 trans->u.class = p1->class;
2974 if (p1->neg_classes != NULL)
2976 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2977 trans->neg_classes =
2978 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2979 if (trans->neg_classes == NULL)
2981 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2982 trans->neg_classes[i] = p1->neg_classes[i];
2983 trans->neg_classes[i] = (tre_ctype_t)0;
2986 trans->neg_classes = NULL;
2988 /* Find out how many tags this transition has. */
2990 if (p1->tags != NULL)
2991 while(p1->tags[i] >= 0)
2994 if (p2->tags != NULL)
2995 while(p2->tags[j] >= 0)
2998 /* If we are overwriting a transition, free the old tag array. */
2999 if (trans->tags != NULL)
3003 /* If there were any tags, allocate an array and fill it. */
3006 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
3010 if (p1->tags != NULL)
3011 while(p1->tags[i] >= 0)
3013 trans->tags[i] = p1->tags[i];
3018 if (p2->tags != NULL)
3019 while (p2->tags[j] >= 0)
3021 /* Don't add duplicates. */
3023 for (k = 0; k < i; k++)
3024 if (trans->tags[k] == p2->tags[j])
3030 trans->tags[l++] = p2->tags[j];
3033 trans->tags[l] = -1;
3041 /* Compute a maximum limit for the number of transitions leaving
3043 while (p1->position >= 0)
3046 while (p2->position >= 0)
3048 counts[p1->position]++;
3056 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
3057 labelled with one character range (there are no transitions on empty
3058 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
3060 static reg_errcode_t
3061 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3062 int *counts, int *offs)
3065 tre_catenation_t *cat;
3066 tre_iteration_t *iter;
3067 reg_errcode_t errcode = REG_OK;
3069 /* XXX - recurse using a stack!. */
3075 uni = (tre_union_t *)node->obj;
3076 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3077 if (errcode != REG_OK)
3079 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3083 cat = (tre_catenation_t *)node->obj;
3084 /* Add a transition from each position in cat->left->lastpos
3085 to each position in cat->right->firstpos. */
3086 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3087 transitions, counts, offs);
3088 if (errcode != REG_OK)
3090 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3091 if (errcode != REG_OK)
3093 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3097 iter = (tre_iteration_t *)node->obj;
3098 assert(iter->max == -1 || iter->max == 1);
3100 if (iter->max == -1)
3102 assert(iter->min == 0 || iter->min == 1);
3103 /* Add a transition from each last position in the iterated
3104 expression to each first position. */
3105 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3106 transitions, counts, offs);
3107 if (errcode != REG_OK)
3110 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3117 #define ERROR_EXIT(err) \
3121 if (/*CONSTCOND*/1) \
3124 while (/*CONSTCOND*/0)
3128 regcomp(regex_t *preg, const char *regex, int cflags)
3131 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3132 tre_pos_and_tags_t *p;
3133 int *counts = NULL, *offs = NULL;
3135 tre_tnfa_transition_t *transitions, *initial;
3136 tre_tnfa_t *tnfa = NULL;
3137 tre_submatch_data_t *submatch_data;
3138 tre_tag_direction_t *tag_directions = NULL;
3139 reg_errcode_t errcode;
3142 /* Parse context. */
3143 tre_parse_ctx_t parse_ctx;
3145 /* Allocate a stack used throughout the compilation process for various
3147 stack = tre_stack_new(512, 10240, 128);
3150 /* Allocate a fast memory allocator. */
3151 mem = tre_mem_new();
3154 tre_stack_destroy(stack);
3158 /* Parse the regexp. */
3159 memset(&parse_ctx, 0, sizeof(parse_ctx));
3160 parse_ctx.mem = mem;
3161 parse_ctx.stack = stack;
3162 parse_ctx.re = regex;
3163 parse_ctx.cflags = cflags;
3164 parse_ctx.max_backref = -1;
3165 errcode = tre_parse(&parse_ctx);
3166 if (errcode != REG_OK)
3167 ERROR_EXIT(errcode);
3168 preg->re_nsub = parse_ctx.submatch_id - 1;
3169 tree = parse_ctx.result;
3171 /* Back references and approximate matching cannot currently be used
3172 in the same regexp. */
3173 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3174 ERROR_EXIT(REG_BADPAT);
3177 tre_ast_print(tree);
3178 #endif /* TRE_DEBUG */
3180 /* Referring to nonexistent subexpressions is illegal. */
3181 if (parse_ctx.max_backref > (int)preg->re_nsub)
3182 ERROR_EXIT(REG_ESUBREG);
3184 /* Allocate the TNFA struct. */
3185 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3187 ERROR_EXIT(REG_ESPACE);
3188 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3189 tnfa->have_approx = parse_ctx.have_approx;
3190 tnfa->num_submatches = parse_ctx.submatch_id;
3192 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3193 regexp does not have back references, this can be skipped. */
3194 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3197 /* Figure out how many tags we will need. */
3198 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3199 if (errcode != REG_OK)
3200 ERROR_EXIT(errcode);
3202 if (tnfa->num_tags > 0)
3204 tag_directions = xmalloc(sizeof(*tag_directions)
3205 * (tnfa->num_tags + 1));
3206 if (tag_directions == NULL)
3207 ERROR_EXIT(REG_ESPACE);
3208 tnfa->tag_directions = tag_directions;
3209 memset(tag_directions, -1,
3210 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3212 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3213 sizeof(tnfa->minimal_tags));
3214 if (tnfa->minimal_tags == NULL)
3215 ERROR_EXIT(REG_ESPACE);
3217 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3218 sizeof(*submatch_data));
3219 if (submatch_data == NULL)
3220 ERROR_EXIT(REG_ESPACE);
3221 tnfa->submatch_data = submatch_data;
3223 errcode = tre_add_tags(mem, stack, tree, tnfa);
3224 if (errcode != REG_OK)
3225 ERROR_EXIT(errcode);
3229 /* Expand iteration nodes. */
3230 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3231 tag_directions, &tnfa->params_depth);
3232 if (errcode != REG_OK)
3233 ERROR_EXIT(errcode);
3235 /* Add a dummy node for the final state.
3236 XXX - For certain patterns this dummy node can be optimized away,
3237 for example "a*" or "ab*". Figure out a simple way to detect
3238 this possibility. */
3240 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3241 if (tmp_ast_r == NULL)
3242 ERROR_EXIT(REG_ESPACE);
3244 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3246 ERROR_EXIT(REG_ESPACE);
3248 errcode = tre_compute_nfl(mem, stack, tree);
3249 if (errcode != REG_OK)
3250 ERROR_EXIT(errcode);
3252 counts = xmalloc(sizeof(int) * parse_ctx.position);
3254 ERROR_EXIT(REG_ESPACE);
3256 offs = xmalloc(sizeof(int) * parse_ctx.position);
3258 ERROR_EXIT(REG_ESPACE);
3260 for (i = 0; i < parse_ctx.position; i++)
3262 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3265 for (i = 0; i < parse_ctx.position; i++)
3268 add += counts[i] + 1;
3271 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3272 if (transitions == NULL)
3273 ERROR_EXIT(REG_ESPACE);
3274 tnfa->transitions = transitions;
3275 tnfa->num_transitions = add;
3277 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3278 if (errcode != REG_OK)
3279 ERROR_EXIT(errcode);
3281 tnfa->firstpos_chars = NULL;
3285 while (p->position >= 0)
3291 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3292 if (initial == NULL)
3293 ERROR_EXIT(REG_ESPACE);
3294 tnfa->initial = initial;
3297 for (p = tree->firstpos; p->position >= 0; p++)
3299 initial[i].state = transitions + offs[p->position];
3300 initial[i].state_id = p->position;
3301 initial[i].tags = NULL;
3302 /* Copy the arrays p->tags, and p->params, they are allocated
3303 from a tre_mem object. */
3307 for (j = 0; p->tags[j] >= 0; j++);
3308 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3309 if (!initial[i].tags)
3310 ERROR_EXIT(REG_ESPACE);
3311 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3313 initial[i].assertions = p->assertions;
3316 initial[i].state = NULL;
3318 tnfa->num_transitions = add;
3319 tnfa->final = transitions + offs[tree->lastpos[0].position];
3320 tnfa->num_states = parse_ctx.position;
3321 tnfa->cflags = cflags;
3323 tre_mem_destroy(mem);
3324 tre_stack_destroy(stack);
3328 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3332 /* Free everything that was allocated and return the error code. */
3333 tre_mem_destroy(mem);
3335 tre_stack_destroy(stack);
3340 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3349 regfree(regex_t *preg)
3353 tre_tnfa_transition_t *trans;
3355 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3359 for (i = 0; i < tnfa->num_transitions; i++)
3360 if (tnfa->transitions[i].state)
3362 if (tnfa->transitions[i].tags)
3363 xfree(tnfa->transitions[i].tags);
3364 if (tnfa->transitions[i].neg_classes)
3365 xfree(tnfa->transitions[i].neg_classes);
3367 if (tnfa->transitions)
3368 xfree(tnfa->transitions);
3372 for (trans = tnfa->initial; trans->state; trans++)
3377 xfree(tnfa->initial);
3380 if (tnfa->submatch_data)
3382 for (i = 0; i < tnfa->num_submatches; i++)
3383 if (tnfa->submatch_data[i].parents)
3384 xfree(tnfa->submatch_data[i].parents);
3385 xfree(tnfa->submatch_data);
3388 if (tnfa->tag_directions)
3389 xfree(tnfa->tag_directions);
3390 if (tnfa->firstpos_chars)
3391 xfree(tnfa->firstpos_chars);
3392 if (tnfa->minimal_tags)
3393 xfree(tnfa->minimal_tags);