src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <stdlib.h>
  34 #include <regex.h>
  35 #include <limits.h>
  36 #include <stdint.h>
  37 #include <ctype.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56 } tre_pos_and_tags_t;
  57
  58
  59 /***********************************************************************
  60  from tre-ast.c and tre-ast.h
  61 ***********************************************************************/
  62
  63 /* The different AST node types. */
  64 typedef enum {
  65   LITERAL,
  66   CATENATION,
  67   ITERATION,
  68   UNION
  69 } tre_ast_type_t;
  70
  71 /* Special subtypes of TRE_LITERAL. */
  72 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  73 #define ASSERTION -2   /* Assertion leaf. */
  74 #define TAG       -3   /* Tag leaf. */
  75 #define BACKREF   -4   /* Back reference leaf. */
  76
  77 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  78 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  80 #define IS_TAG(x)       ((x)->code_min == TAG)
  81 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  82
  83
  84 /* A generic AST node.  All AST nodes consist of this node on the top
  85    level with `obj' pointing to the actual content. */
  86 typedef struct {
  87   tre_ast_type_t type;   /* Type of the node. */
  88   void *obj;             /* Pointer to actual node. */
  89   int nullable;
  90   int submatch_id;
  91   int num_submatches;
  92   int num_tags;
  93   tre_pos_and_tags_t *firstpos;
  94   tre_pos_and_tags_t *lastpos;
  95 } tre_ast_node_t;
  96
  97
  98 /* A "literal" node.  These are created for assertions, back references,
  99    tags, matching parameter settings, and all expressions that match one
 100    character. */
 101 typedef struct {
 102   long code_min;
 103   long code_max;
 104   int position;
 105   tre_ctype_t class;
 106   tre_ctype_t *neg_classes;
 107 } tre_literal_t;
 108
 109 /* A "catenation" node.  These are created when two regexps are concatenated.
 110    If there are more than one subexpressions in sequence, the `left' part
 111    holds all but the last, and `right' part holds the last subexpression
 112    (catenation is left associative). */
 113 typedef struct {
 114   tre_ast_node_t *left;
 115   tre_ast_node_t *right;
 116 } tre_catenation_t;
 117
 118 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 119    operators. */
 120 typedef struct {
 121   /* Subexpression to match. */
 122   tre_ast_node_t *arg;
 123   /* Minimum number of consecutive matches. */
 124   int min;
 125   /* Maximum number of consecutive matches. */
 126   int max;
 127   /* If 0, match as many characters as possible, if 1 match as few as
 128      possible.  Note that this does not always mean the same thing as
 129      matching as many/few repetitions as possible. */
 130   unsigned int minimal:1;
 131 } tre_iteration_t;
 132
 133 /* An "union" node.  These are created for the "|" operator. */
 134 typedef struct {
 135   tre_ast_node_t *left;
 136   tre_ast_node_t *right;
 137 } tre_union_t;
 138
 139
 140 static tre_ast_node_t *
 141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 142 {
 143         tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
 144         if (!node || !obj)
 145                 return 0;
 146         node->obj = obj;
 147         node->type = type;
 148         node->nullable = -1;
 149         node->submatch_id = -1;
 150         return node;
 151 }
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 155 {
 156         tre_ast_node_t *node;
 157         tre_literal_t *lit;
 158
 159         lit = tre_mem_calloc(mem, sizeof *lit);
 160         node = tre_ast_new_node(mem, LITERAL, lit);
 161         if (!node)
 162                 return 0;
 163         lit->code_min = code_min;
 164         lit->code_max = code_max;
 165         lit->position = position;
 166         return node;
 167 }
 168
 169 static tre_ast_node_t *
 170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 171 {
 172         tre_ast_node_t *node;
 173         tre_iteration_t *iter;
 174
 175         iter = tre_mem_calloc(mem, sizeof *iter);
 176         node = tre_ast_new_node(mem, ITERATION, iter);
 177         if (!node)
 178                 return 0;
 179         iter->arg = arg;
 180         iter->min = min;
 181         iter->max = max;
 182         iter->minimal = minimal;
 183         node->num_submatches = arg->num_submatches;
 184         return node;
 185 }
 186
 187 static tre_ast_node_t *
 188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 189 {
 190         tre_ast_node_t *node;
 191         tre_union_t *un;
 192
 193         if (!left)
 194                 return right;
 195         un = tre_mem_calloc(mem, sizeof *un);
 196         node = tre_ast_new_node(mem, UNION, un);
 197         if (!node || !right)
 198                 return 0;
 199         un->left = left;
 200         un->right = right;
 201         node->num_submatches = left->num_submatches + right->num_submatches;
 202         return node;
 203 }
 204
 205 static tre_ast_node_t *
 206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 207 {
 208         tre_ast_node_t *node;
 209         tre_catenation_t *cat;
 210
 211         if (!left)
 212                 return right;
 213         cat = tre_mem_calloc(mem, sizeof *cat);
 214         node = tre_ast_new_node(mem, CATENATION, cat);
 215         if (!node)
 216                 return 0;
 217         cat->left = left;
 218         cat->right = right;
 219         node->num_submatches = left->num_submatches + right->num_submatches;
 220         return node;
 221 }
 222
 223
 224 /***********************************************************************
 225  from tre-stack.c and tre-stack.h
 226 ***********************************************************************/
 227
 228 typedef struct tre_stack_rec tre_stack_t;
 229
 230 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 231    is maximum size, and `increment' specifies how much more space will be
 232    allocated with realloc() if all space gets used up.  Returns the stack
 233    object or NULL if out of memory. */
 234 static tre_stack_t *
 235 tre_stack_new(int size, int max_size, int increment);
 236
 237 /* Frees the stack object. */
 238 static void
 239 tre_stack_destroy(tre_stack_t *s);
 240
 241 /* Returns the current number of objects in the stack. */
 242 static int
 243 tre_stack_num_objects(tre_stack_t *s);
 244
 245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 247    This tries to realloc() more space before failing if maximum size
 248    has not yet been reached.  Returns REG_OK if successful. */
 249 #define declare_pushf(typetag, type)                                          \
 250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 251
 252 declare_pushf(voidptr, void *);
 253 declare_pushf(int, int);
 254
 255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 256    element off of stack `s' and returns it.  The stack must not be
 257    empty. */
 258 #define declare_popf(typetag, type)               \
 259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 260
 261 declare_popf(voidptr, void *);
 262 declare_popf(int, int);
 263
 264 /* Just to save some typing. */
 265 #define STACK_PUSH(s, typetag, value)                                         \
 266   do                                                                          \
 267     {                                                                         \
 268       status = tre_stack_push_ ## typetag(s, value);                          \
 269     }                                                                         \
 270   while (/*CONSTCOND*/0)
 271
 272 #define STACK_PUSHX(s, typetag, value)                                        \
 273   {                                                                           \
 274     status = tre_stack_push_ ## typetag(s, value);                            \
 275     if (status != REG_OK)                                                     \
 276       break;                                                                  \
 277   }
 278
 279 #define STACK_PUSHR(s, typetag, value)                                        \
 280   {                                                                           \
 281     reg_errcode_t _status;                                                    \
 282     _status = tre_stack_push_ ## typetag(s, value);                           \
 283     if (_status != REG_OK)                                                    \
 284       return _status;                                                         \
 285   }
 286
 287 union tre_stack_item {
 288   void *voidptr_value;
 289   int int_value;
 290 };
 291
 292 struct tre_stack_rec {
 293   int size;
 294   int max_size;
 295   int increment;
 296   int ptr;
 297   union tre_stack_item *stack;
 298 };
 299
 300
 301 static tre_stack_t *
 302 tre_stack_new(int size, int max_size, int increment)
 303 {
 304   tre_stack_t *s;
 305
 306   s = xmalloc(sizeof(*s));
 307   if (s != NULL)
 308     {
 309       s->stack = xmalloc(sizeof(*s->stack) * size);
 310       if (s->stack == NULL)
 311         {
 312           xfree(s);
 313           return NULL;
 314         }
 315       s->size = size;
 316       s->max_size = max_size;
 317       s->increment = increment;
 318       s->ptr = 0;
 319     }
 320   return s;
 321 }
 322
 323 static void
 324 tre_stack_destroy(tre_stack_t *s)
 325 {
 326   xfree(s->stack);
 327   xfree(s);
 328 }
 329
 330 static int
 331 tre_stack_num_objects(tre_stack_t *s)
 332 {
 333   return s->ptr;
 334 }
 335
 336 static reg_errcode_t
 337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 338 {
 339   if (s->ptr < s->size)
 340     {
 341       s->stack[s->ptr] = value;
 342       s->ptr++;
 343     }
 344   else
 345     {
 346       if (s->size >= s->max_size)
 347         {
 348           return REG_ESPACE;
 349         }
 350       else
 351         {
 352           union tre_stack_item *new_buffer;
 353           int new_size;
 354           new_size = s->size + s->increment;
 355           if (new_size > s->max_size)
 356             new_size = s->max_size;
 357           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 358           if (new_buffer == NULL)
 359             {
 360               return REG_ESPACE;
 361             }
 362           assert(new_size > s->size);
 363           s->size = new_size;
 364           s->stack = new_buffer;
 365           tre_stack_push(s, value);
 366         }
 367     }
 368   return REG_OK;
 369 }
 370
 371 #define define_pushf(typetag, type)  \
 372   declare_pushf(typetag, type) {     \
 373     union tre_stack_item item;       \
 374     item.typetag ## _value = value;  \
 375     return tre_stack_push(s, item);  \
 376 }
 377
 378 define_pushf(int, int)
 379 define_pushf(voidptr, void *)
 380
 381 #define define_popf(typetag, type)                  \
 382   declare_popf(typetag, type) {                     \
 383     return s->stack[--s->ptr].typetag ## _value;    \
 384   }
 385
 386 define_popf(int, int)
 387 define_popf(voidptr, void *)
 388
 389
 390 /***********************************************************************
 391  from tre-parse.c and tre-parse.h
 392 ***********************************************************************/
 393
 394 /* Parse context. */
 395 typedef struct {
 396         /* Memory allocator. The AST is allocated using this. */
 397         tre_mem_t mem;
 398         /* Stack used for keeping track of regexp syntax. */
 399         tre_stack_t *stack;
 400         /* The parsed node after a parse function returns. */
 401         tre_ast_node_t *n;
 402         /* Position in the regexp pattern after a parse function returns. */
 403         const char *s;
 404         /* The first character of the regexp. */
 405         const char *re;
 406         /* Current submatch ID. */
 407         int submatch_id;
 408         /* Current position (number of literal). */
 409         int position;
 410         /* The highest back reference or -1 if none seen so far. */
 411         int max_backref;
 412         /* Compilation flags. */
 413         int cflags;
 414 } tre_parse_ctx_t;
 415
 416 /* Some macros for expanding \w, \s, etc. */
 417 static const struct {
 418         char c;
 419         const char *expansion;
 420 } tre_macros[] = {
 421         {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 422         {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 423         {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 424         {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
 425         { 0, 0 }
 426 };
 427
 428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 429    must have at least `len' items.  Sets buf[0] to zero if the there
 430    is no match in `tre_macros'. */
 431 static const char *tre_expand_macro(const char *s)
 432 {
 433         int i;
 434         for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
 435         return tre_macros[i].expansion;
 436 }
 437
 438 static int
 439 tre_compare_lit(const void *a, const void *b)
 440 {
 441         const tre_literal_t *const *la = a;
 442         const tre_literal_t *const *lb = b;
 443         /* assumes the range of valid code_min is < INT_MAX */
 444         return la[0]->code_min - lb[0]->code_min;
 445 }
 446
 447 struct literals {
 448         tre_mem_t mem;
 449         tre_literal_t **a;
 450         int len;
 451         int cap;
 452 };
 453
 454 static tre_literal_t *tre_new_lit(struct literals *p)
 455 {
 456         tre_literal_t **a;
 457         if (p->len >= p->cap) {
 458                 if (p->cap >= 1<<15)
 459                         return 0;
 460                 p->cap *= 2;
 461                 a = xrealloc(p->a, p->cap * sizeof *p->a);
 462                 if (!a)
 463                         return 0;
 464                 p->a = a;
 465         }
 466         a = p->a + p->len++;
 467         *a = tre_mem_calloc(p->mem, sizeof **a);
 468         return *a;
 469 }
 470
 471 static int add_icase_literals(struct literals *ls, int min, int max)
 472 {
 473         tre_literal_t *lit;
 474         int b, e, c;
 475         for (c=min; c<=max; ) {
 476                 /* assumes islower(c) and isupper(c) are exclusive
 477                    and toupper(c)!=c if islower(c).
 478                    multiple opposite case characters are not supported */
 479                 if (tre_islower(c)) {
 480                         b = e = tre_toupper(c);
 481                         for (c++, e++; c<=max; c++, e++)
 482                                 if (tre_toupper(c) != e) break;
 483                 } else if (tre_isupper(c)) {
 484                         b = e = tre_tolower(c);
 485                         for (c++, e++; c<=max; c++, e++)
 486                                 if (tre_tolower(c) != e) break;
 487                 } else {
 488                         c++;
 489                         continue;
 490                 }
 491                 lit = tre_new_lit(ls);
 492                 if (!lit)
 493                         return -1;
 494                 lit->code_min = b;
 495                 lit->code_max = e-1;
 496                 lit->position = -1;
 497         }
 498         return 0;
 499 }
 500
 501
 502 /* Maximum number of character classes in a negated bracket expression. */
 503 #define MAX_NEG_CLASSES 64
 504
 505 struct neg {
 506         int negate;
 507         int len;
 508         tre_ctype_t a[MAX_NEG_CLASSES];
 509 };
 510
 511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 512
 513 /*
 514 bracket grammar:
 515 Bracket  =  '[' List ']'  |  '[^' List ']'
 516 List     =  Term  |  List Term
 517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
 518 Range    =  Char '-' Char  |  Char '-' '-'
 519 Char     =  Coll  |  coll_single
 520 Meta     =  ']'  |  '-'
 521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
 522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
 523 Chclass  =  '[:' class ':]'
 524
 525 coll_single is a single char collating element but it can be
 526  '-' only at the beginning or end of a List and
 527  ']' only at the beginning of a List and
 528  '^' anywhere except after the openning '['
 529 */
 530
 531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
 532 {
 533         const char *start = s;
 534         tre_ctype_t class;
 535         int min, max;
 536         wchar_t wc;
 537         int len;
 538
 539         for (;;) {
 540                 class = 0;
 541                 len = mbtowc(&wc, s, -1);
 542                 if (len <= 0)
 543                         return *s ? REG_BADPAT : REG_EBRACK;
 544                 if (*s == ']' && s != start) {
 545                         ctx->s = s+1;
 546                         return REG_OK;
 547                 }
 548                 if (*s == '-' && s != start && s[1] != ']' &&
 549                     /* extension: [a-z--@] is accepted as [a-z]|[--@] */
 550                     (s[1] != '-' || s[2] == ']'))
 551                         return REG_ERANGE;
 552                 if (*s == '[' && (s[1] == '.' || s[1] == '='))
 553                         /* collating symbols and equivalence classes are not supported */
 554                         return REG_ECOLLATE;
 555                 if (*s == '[' && s[1] == ':') {
 556                         char tmp[CHARCLASS_NAME_MAX+1];
 557                         s += 2;
 558                         for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
 559                                 if (s[len] == ':') {
 560                                         memcpy(tmp, s, len);
 561                                         tmp[len] = 0;
 562                                         class = tre_ctype(tmp);
 563                                         break;
 564                                 }
 565                         }
 566                         if (!class || s[len+1] != ']')
 567                                 return REG_ECTYPE;
 568                         min = 0;
 569                         max = TRE_CHAR_MAX;
 570                         s += len+2;
 571                 } else {
 572                         min = max = wc;
 573                         s += len;
 574                         if (*s == '-' && s[1] != ']') {
 575                                 s++;
 576                                 len = mbtowc(&wc, s, -1);
 577                                 max = wc;
 578                                 /* XXX - Should use collation order instead of
 579                                    encoding values in character ranges. */
 580                                 if (len <= 0 || min > max)
 581                                         return REG_ERANGE;
 582                                 s += len;
 583                         }
 584                 }
 585
 586                 if (class && neg->negate) {
 587                         if (neg->len >= MAX_NEG_CLASSES)
 588                                 return REG_ESPACE;
 589                         neg->a[neg->len++] = class;
 590                 } else  {
 591                         tre_literal_t *lit = tre_new_lit(ls);
 592                         if (!lit)
 593                                 return REG_ESPACE;
 594                         lit->code_min = min;
 595                         lit->code_max = max;
 596                         lit->class = class;
 597                         lit->position = -1;
 598
 599                         /* Add opposite-case codepoints if REG_ICASE is present.
 600                            It seems that POSIX requires that bracket negation
 601                            should happen before case-folding, but most practical
 602                            implementations do it the other way around. Changing
 603                            the order would need efficient representation of
 604                            case-fold ranges and bracket range sets even with
 605                            simple patterns so this is ok for now. */
 606                         if (ctx->cflags & REG_ICASE && !class)
 607                                 if (add_icase_literals(ls, min, max))
 608                                         return REG_ESPACE;
 609                 }
 610         }
 611 }
 612
 613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 614 {
 615         int i, max, min, negmax, negmin;
 616         tre_ast_node_t *node = 0, *n;
 617         tre_ctype_t *nc = 0;
 618         tre_literal_t *lit;
 619         struct literals ls;
 620         struct neg neg;
 621         reg_errcode_t err;
 622
 623         ls.mem = ctx->mem;
 624         ls.len = 0;
 625         ls.cap = 32;
 626         ls.a = xmalloc(ls.cap * sizeof *ls.a);
 627         if (!ls.a)
 628                 return REG_ESPACE;
 629         neg.len = 0;
 630         neg.negate = *s == '^';
 631         if (neg.negate)
 632                 s++;
 633
 634         err = parse_bracket_terms(ctx, s, &ls, &neg);
 635         if (err != REG_OK)
 636                 goto parse_bracket_done;
 637
 638         if (neg.negate) {
 639                 /* Sort the array if we need to negate it. */
 640                 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
 641                 /* extra lit for the last negated range */
 642                 lit = tre_new_lit(&ls);
 643                 if (!lit) {
 644                         err = REG_ESPACE;
 645                         goto parse_bracket_done;
 646                 }
 647                 lit->code_min = TRE_CHAR_MAX+1;
 648                 lit->code_max = TRE_CHAR_MAX+1;
 649                 lit->position = -1;
 650                 /* negated classes */
 651                 if (neg.len) {
 652                         nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
 653                         if (!nc) {
 654                                 err = REG_ESPACE;
 655                                 goto parse_bracket_done;
 656                         }
 657                         memcpy(nc, neg.a, neg.len*sizeof *neg.a);
 658                         nc[neg.len] = 0;
 659                 }
 660         }
 661
 662         /* Build a union of the items in the array, negated if necessary. */
 663         negmax = negmin = 0;
 664         for (i = 0; i < ls.len; i++) {
 665                 lit = ls.a[i];
 666                 min = lit->code_min;
 667                 max = lit->code_max;
 668                 if (neg.negate) {
 669                         if (min <= negmin) {
 670                                 /* Overlap. */
 671                                 negmin = MAX(max + 1, negmin);
 672                                 continue;
 673                         }
 674                         negmax = min - 1;
 675                         lit->code_min = negmin;
 676                         lit->code_max = negmax;
 677                         negmin = max + 1;
 678                 }
 679                 lit->position = ctx->position;
 680                 lit->neg_classes = nc;
 681                 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
 682                 node = tre_ast_new_union(ctx->mem, node, n);
 683                 if (!node) {
 684                         err = REG_ESPACE;
 685                         break;
 686                 }
 687         }
 688
 689 parse_bracket_done:
 690         xfree(ls.a);
 691         ctx->position++;
 692         ctx->n = node;
 693         return err;
 694 }
 695
 696 static const char *parse_dup_count(const char *s, int *n)
 697 {
 698         *n = -1;
 699         if (!isdigit(*s))
 700                 return s;
 701         *n = 0;
 702         for (;;) {
 703                 *n = 10 * *n + (*s - '0');
 704                 s++;
 705                 if (!isdigit(*s) || *n > RE_DUP_MAX)
 706                         break;
 707         }
 708         return s;
 709 }
 710
 711 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
 712 {
 713         int min, max;
 714
 715         s = parse_dup_count(s, &min);
 716         if (*s == ',')
 717                 s = parse_dup_count(s+1, &max);
 718         else
 719                 max = min;
 720
 721         if (
 722                 (max < min && max >= 0) ||
 723                 max > RE_DUP_MAX ||
 724                 min > RE_DUP_MAX ||
 725                 min < 0 ||
 726                 (!ere && *s++ != '\\') ||
 727                 *s++ != '}'
 728         )
 729                 return 0;
 730         *pmin = min;
 731         *pmax = max;
 732         return s;
 733 }
 734
 735 static int hexval(unsigned c)
 736 {
 737         if (c-'0'<10) return c-'0';
 738         c |= 32;
 739         if (c-'a'<6) return c-'a'+10;
 740         return -1;
 741 }
 742
 743 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
 744 {
 745         if (node->submatch_id >= 0) {
 746                 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 747                 if (!n)
 748                         return REG_ESPACE;
 749                 n = tre_ast_new_catenation(ctx->mem, n, node);
 750                 if (!n)
 751                         return REG_ESPACE;
 752                 n->num_submatches = node->num_submatches;
 753                 node = n;
 754         }
 755         node->submatch_id = subid;
 756         node->num_submatches++;
 757         ctx->n = node;
 758         return REG_OK;
 759 }
 760
 761 /*
 762 BRE grammar:
 763 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
 764 Branch =  Atom  |  Branch Atom
 765 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
 766 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 767
 768 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 769
 770 ERE grammar:
 771 Regex  =  Branch  |  Regex '|' Branch
 772 Branch =  Atom  |  Branch Atom
 773 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
 774 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 775
 776 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 777 */
 778
 779 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
 780 {
 781         int len, ere = ctx->cflags & REG_EXTENDED;
 782         const char *p;
 783         tre_ast_node_t *node;
 784         wchar_t wc;
 785         switch (*s) {
 786         case '[':
 787                 return parse_bracket(ctx, s+1);
 788         case '\\':
 789                 p = tre_expand_macro(s+1);
 790                 if (p) {
 791                         /* assume \X expansion is a single atom */
 792                         reg_errcode_t err = parse_atom(ctx, p);
 793                         ctx->s = s+2;
 794                         return err;
 795                 }
 796                 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
 797                 switch (*++s) {
 798                 case 0:
 799                         return REG_EESCAPE;
 800                 case 'b':
 801                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
 802                         break;
 803                 case 'B':
 804                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
 805                         break;
 806                 case '<':
 807                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
 808                         break;
 809                 case '>':
 810                         node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
 811                         break;
 812                 case 'x':
 813                         s++;
 814                         int i, v = 0, c;
 815                         len = 2;
 816                         if (*s == '{') {
 817                                 len = 8;
 818                                 s++;
 819                         }
 820                         for (i=0; i<len && v<0x110000; i++) {
 821                                 c = hexval(s[i]);
 822                                 if (c < 0) break;
 823                                 v = 16*v + c;
 824                         }
 825                         s += i;
 826                         if (len == 8) {
 827                                 if (*s != '}')
 828                                         return REG_EBRACE;
 829                                 s++;
 830                         }
 831                         node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
 832                         s--;
 833                         break;
 834                 case '{':
 835                 case '+':
 836                 case '?':
 837                         /* extension: treat \+, \? as repetitions in BRE */
 838                         /* reject repetitions after empty expression in BRE */
 839                         if (!ere)
 840                                 return REG_BADRPT;
 841                 case '|':
 842                         /* extension: treat \| as alternation in BRE */
 843                         if (!ere) {
 844                                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 845                                 s--;
 846                                 goto end;
 847                         }
 848                         /* fallthrough */
 849                 default:
 850                         if (!ere && (unsigned)*s-'1' < 9) {
 851                                 /* back reference */
 852                                 int val = *s - '0';
 853                                 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
 854                                 ctx->max_backref = MAX(val, ctx->max_backref);
 855                         } else {
 856                                 /* extension: accept unknown escaped char
 857                                    as a literal */
 858                                 goto parse_literal;
 859                         }
 860                 }
 861                 s++;
 862                 break;
 863         case '.':
 864                 if (ctx->cflags & REG_NEWLINE) {
 865                         tre_ast_node_t *tmp1, *tmp2;
 866                         tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
 867                         tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
 868                         if (tmp1 && tmp2)
 869                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 870                         else
 871                                 node = 0;
 872                 } else {
 873                         node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 874                 }
 875                 s++;
 876                 break;
 877         case '^':
 878                 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
 879                 if (!ere && s != ctx->re)
 880                         goto parse_literal;
 881                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
 882                 s++;
 883                 break;
 884         case '$':
 885                 /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
 886                 if (!ere && s[1])
 887                         goto parse_literal;
 888                 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
 889                 s++;
 890                 break;
 891         case '*':
 892         case '{':
 893         case '+':
 894         case '?':
 895                 /* reject repetitions after empty expression in ERE */
 896                 if (ere)
 897                         return REG_BADRPT;
 898         case '|':
 899                 if (!ere)
 900                         goto parse_literal;
 901         case 0:
 902                 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 903                 break;
 904         default:
 905 parse_literal:
 906                 len = mbtowc(&wc, s, -1);
 907                 if (len < 0)
 908                         return REG_BADPAT;
 909                 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
 910                         tre_ast_node_t *tmp1, *tmp2;
 911                         /* multiple opposite case characters are not supported */
 912                         tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
 913                         tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
 914                         if (tmp1 && tmp2)
 915                                 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
 916                         else
 917                                 node = 0;
 918                 } else {
 919                         node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 920                 }
 921                 ctx->position++;
 922                 s += len;
 923                 break;
 924         }
 925 end:
 926         if (!node)
 927                 return REG_ESPACE;
 928         ctx->n = node;
 929         ctx->s = s;
 930         return REG_OK;
 931 }
 932
 933 #define PUSHPTR(err, s, v) do { \
 934         if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
 935                 return err; \
 936 } while(0)
 937
 938 #define PUSHINT(err, s, v) do { \
 939         if ((err = tre_stack_push_int(s, v)) != REG_OK) \
 940                 return err; \
 941 } while(0)
 942
 943 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
 944 {
 945         tre_ast_node_t *nbranch=0, *nunion=0;
 946         int ere = ctx->cflags & REG_EXTENDED;
 947         const char *s = ctx->re;
 948         int subid = 0;
 949         int depth = 0;
 950         reg_errcode_t err;
 951         tre_stack_t *stack = ctx->stack;
 952
 953         PUSHINT(err, stack, subid++);
 954         for (;;) {
 955                 if ((!ere && *s == '\\' && s[1] == '(') ||
 956                     (ere && *s == '(')) {
 957                         PUSHPTR(err, stack, nunion);
 958                         PUSHPTR(err, stack, nbranch);
 959                         PUSHINT(err, stack, subid++);
 960                         s++;
 961                         if (!ere)
 962                                 s++;
 963                         depth++;
 964                         nbranch = nunion = 0;
 965                         continue;
 966                 }
 967                 if ((!ere && *s == '\\' && s[1] == ')') ||
 968                     (ere && *s == ')' && depth)) {
 969                         ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 970                         if (!ctx->n)
 971                                 return REG_ESPACE;
 972                 } else {
 973                         err = parse_atom(ctx, s);
 974                         if (err != REG_OK)
 975                                 return err;
 976                         s = ctx->s;
 977                 }
 978
 979         parse_iter:
 980                 for (;;) {
 981                         int min, max;
 982
 983                         if (*s!='\\' && *s!='*') {
 984                                 if (!ere)
 985                                         break;
 986                                 if (*s!='+' && *s!='?' && *s!='{')
 987                                         break;
 988                         }
 989                         if (*s=='\\' && ere)
 990                                 break;
 991                         /* extension: treat \+, \? as repetitions in BRE */
 992                         if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
 993                                 break;
 994                         if (*s=='\\')
 995                                 s++;
 996
 997                         /* handle ^* at the start of a complete BRE. */
 998                         if (!ere && s==ctx->re+1 && s[-1]=='^')
 999                                 break;
1000
1001                         /* extension: multiple consecutive *+?{,} is unspecified,
1002                            but (a+)+ has to be supported so accepting a++ makes
1003                            sense, note however that the RE_DUP_MAX limit can be
1004                            circumvented: (a{255}){255} uses a lot of memory.. */
1005                         if (*s=='{') {
1006                                 s = parse_dup(s+1, ere, &min, &max);
1007                                 if (!s)
1008                                         return REG_BADBR;
1009                         } else {
1010                                 min=0;
1011                                 max=-1;
1012                                 if (*s == '+')
1013                                         min = 1;
1014                                 if (*s == '?')
1015                                         max = 1;
1016                                 s++;
1017                         }
1018                         if (max == 0)
1019                                 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1020                         else
1021                                 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1022                         if (!ctx->n)
1023                                 return REG_ESPACE;
1024                 }
1025
1026                 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1027                 if ((ere && *s == '|') ||
1028                     (ere && *s == ')' && depth) ||
1029                     (!ere && *s == '\\' && s[1] == ')') ||
1030                     /* extension: treat \| as alternation in BRE */
1031                     (!ere && *s == '\\' && s[1] == '|') ||
1032                     !*s) {
1033                         /* extension: empty branch is unspecified (), (|a), (a|)
1034                            here they are not rejected but match on empty string */
1035                         int c = *s;
1036                         nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1037                         nbranch = 0;
1038
1039                         if (c == '\\' && s[1] == '|') {
1040                                 s+=2;
1041                         } else if (c == '|') {
1042                                 s++;
1043                         } else {
1044                                 if (c == '\\') {
1045                                         if (!depth) return REG_EPAREN;
1046                                         s+=2;
1047                                 } else if (c == ')')
1048                                         s++;
1049                                 depth--;
1050                                 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1051                                 if (err != REG_OK)
1052                                         return err;
1053                                 if (!c && depth<0) {
1054                                         ctx->submatch_id = subid;
1055                                         return REG_OK;
1056                                 }
1057                                 if (!c || depth<0)
1058                                         return REG_EPAREN;
1059                                 nbranch = tre_stack_pop_voidptr(stack);
1060                                 nunion = tre_stack_pop_voidptr(stack);
1061                                 goto parse_iter;
1062                         }
1063                 }
1064         }
1065 }
1066
1067
1068 /***********************************************************************
1069  from tre-compile.c
1070 ***********************************************************************/
1071
1072
1073 /*
1074   TODO:
1075    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1076      function calls.
1077 */
1078
1079 /*
1080   Algorithms to setup tags so that submatch addressing can be done.
1081 */
1082
1083
1084 /* Inserts a catenation node to the root of the tree given in `node'.
1085    As the left child a new tag with number `tag_id' to `node' is added,
1086    and the right child is the old root. */
1087 static reg_errcode_t
1088 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1089 {
1090   tre_catenation_t *c;
1091
1092   c = tre_mem_alloc(mem, sizeof(*c));
1093   if (c == NULL)
1094     return REG_ESPACE;
1095   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1096   if (c->left == NULL)
1097     return REG_ESPACE;
1098   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1099   if (c->right == NULL)
1100     return REG_ESPACE;
1101
1102   c->right->obj = node->obj;
1103   c->right->type = node->type;
1104   c->right->nullable = -1;
1105   c->right->submatch_id = -1;
1106   c->right->firstpos = NULL;
1107   c->right->lastpos = NULL;
1108   c->right->num_tags = 0;
1109   c->right->num_submatches = 0;
1110   node->obj = c;
1111   node->type = CATENATION;
1112   return REG_OK;
1113 }
1114
1115 /* Inserts a catenation node to the root of the tree given in `node'.
1116    As the right child a new tag with number `tag_id' to `node' is added,
1117    and the left child is the old root. */
1118 static reg_errcode_t
1119 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1120 {
1121   tre_catenation_t *c;
1122
1123   c = tre_mem_alloc(mem, sizeof(*c));
1124   if (c == NULL)
1125     return REG_ESPACE;
1126   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1127   if (c->right == NULL)
1128     return REG_ESPACE;
1129   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1130   if (c->left == NULL)
1131     return REG_ESPACE;
1132
1133   c->left->obj = node->obj;
1134   c->left->type = node->type;
1135   c->left->nullable = -1;
1136   c->left->submatch_id = -1;
1137   c->left->firstpos = NULL;
1138   c->left->lastpos = NULL;
1139   c->left->num_tags = 0;
1140   c->left->num_submatches = 0;
1141   node->obj = c;
1142   node->type = CATENATION;
1143   return REG_OK;
1144 }
1145
1146 typedef enum {
1147   ADDTAGS_RECURSE,
1148   ADDTAGS_AFTER_ITERATION,
1149   ADDTAGS_AFTER_UNION_LEFT,
1150   ADDTAGS_AFTER_UNION_RIGHT,
1151   ADDTAGS_AFTER_CAT_LEFT,
1152   ADDTAGS_AFTER_CAT_RIGHT,
1153   ADDTAGS_SET_SUBMATCH_END
1154 } tre_addtags_symbol_t;
1155
1156
1157 typedef struct {
1158   int tag;
1159   int next_tag;
1160 } tre_tag_states_t;
1161
1162
1163 /* Go through `regset' and set submatch data for submatches that are
1164    using this tag. */
1165 static void
1166 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1167 {
1168   int i;
1169
1170   for (i = 0; regset[i] >= 0; i++)
1171     {
1172       int id = regset[i] / 2;
1173       int start = !(regset[i] % 2);
1174       if (start)
1175         tnfa->submatch_data[id].so_tag = tag;
1176       else
1177         tnfa->submatch_data[id].eo_tag = tag;
1178     }
1179   regset[0] = -1;
1180 }
1181
1182
1183 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1184    subexpressions marked for submatch addressing can be traced. */
1185 static reg_errcode_t
1186 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1187              tre_tnfa_t *tnfa)
1188 {
1189   reg_errcode_t status = REG_OK;
1190   tre_addtags_symbol_t symbol;
1191   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1192   int bottom = tre_stack_num_objects(stack);
1193   /* True for first pass (counting number of needed tags) */
1194   int first_pass = (mem == NULL || tnfa == NULL);
1195   int *regset, *orig_regset;
1196   int num_tags = 0; /* Total number of tags. */
1197   int num_minimals = 0;  /* Number of special minimal tags. */
1198   int tag = 0;      /* The tag that is to be added next. */
1199   int next_tag = 1; /* Next tag to use after this one. */
1200   int *parents;     /* Stack of submatches the current submatch is
1201                        contained in. */
1202   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1203   tre_tag_states_t *saved_states;
1204
1205   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1206   if (!first_pass)
1207     {
1208       tnfa->end_tag = 0;
1209       tnfa->minimal_tags[0] = -1;
1210     }
1211
1212   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1213   if (regset == NULL)
1214     return REG_ESPACE;
1215   regset[0] = -1;
1216   orig_regset = regset;
1217
1218   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1219   if (parents == NULL)
1220     {
1221       xfree(regset);
1222       return REG_ESPACE;
1223     }
1224   parents[0] = -1;
1225
1226   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1227   if (saved_states == NULL)
1228     {
1229       xfree(regset);
1230       xfree(parents);
1231       return REG_ESPACE;
1232     }
1233   else
1234     {
1235       unsigned int i;
1236       for (i = 0; i <= tnfa->num_submatches; i++)
1237         saved_states[i].tag = -1;
1238     }
1239
1240   STACK_PUSH(stack, voidptr, node);
1241   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1242
1243   while (tre_stack_num_objects(stack) > bottom)
1244     {
1245       if (status != REG_OK)
1246         break;
1247
1248       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1249       switch (symbol)
1250         {
1251
1252         case ADDTAGS_SET_SUBMATCH_END:
1253           {
1254             int id = tre_stack_pop_int(stack);
1255             int i;
1256
1257             /* Add end of this submatch to regset. */
1258             for (i = 0; regset[i] >= 0; i++);
1259             regset[i] = id * 2 + 1;
1260             regset[i + 1] = -1;
1261
1262             /* Pop this submatch from the parents stack. */
1263             for (i = 0; parents[i] >= 0; i++);
1264             parents[i - 1] = -1;
1265             break;
1266           }
1267
1268         case ADDTAGS_RECURSE:
1269           node = tre_stack_pop_voidptr(stack);
1270
1271           if (node->submatch_id >= 0)
1272             {
1273               int id = node->submatch_id;
1274               int i;
1275
1276
1277               /* Add start of this submatch to regset. */
1278               for (i = 0; regset[i] >= 0; i++);
1279               regset[i] = id * 2;
1280               regset[i + 1] = -1;
1281
1282               if (!first_pass)
1283                 {
1284                   for (i = 0; parents[i] >= 0; i++);
1285                   tnfa->submatch_data[id].parents = NULL;
1286                   if (i > 0)
1287                     {
1288                       int *p = xmalloc(sizeof(*p) * (i + 1));
1289                       if (p == NULL)
1290                         {
1291                           status = REG_ESPACE;
1292                           break;
1293                         }
1294                       assert(tnfa->submatch_data[id].parents == NULL);
1295                       tnfa->submatch_data[id].parents = p;
1296                       for (i = 0; parents[i] >= 0; i++)
1297                         p[i] = parents[i];
1298                       p[i] = -1;
1299                     }
1300                 }
1301
1302               /* Add end of this submatch to regset after processing this
1303                  node. */
1304               STACK_PUSHX(stack, int, node->submatch_id);
1305               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1306             }
1307
1308           switch (node->type)
1309             {
1310             case LITERAL:
1311               {
1312                 tre_literal_t *lit = node->obj;
1313
1314                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1315                   {
1316                     int i;
1317                     if (regset[0] >= 0)
1318                       {
1319                         /* Regset is not empty, so add a tag before the
1320                            literal or backref. */
1321                         if (!first_pass)
1322                           {
1323                             status = tre_add_tag_left(mem, node, tag);
1324                             tnfa->tag_directions[tag] = direction;
1325                             if (minimal_tag >= 0)
1326                               {
1327                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1328                                 tnfa->minimal_tags[i] = tag;
1329                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1330                                 tnfa->minimal_tags[i + 2] = -1;
1331                                 minimal_tag = -1;
1332                                 num_minimals++;
1333                               }
1334                             tre_purge_regset(regset, tnfa, tag);
1335                           }
1336                         else
1337                           {
1338                             node->num_tags = 1;
1339                           }
1340
1341                         regset[0] = -1;
1342                         tag = next_tag;
1343                         num_tags++;
1344                         next_tag++;
1345                       }
1346                   }
1347                 else
1348                   {
1349                     assert(!IS_TAG(lit));
1350                   }
1351                 break;
1352               }
1353             case CATENATION:
1354               {
1355                 tre_catenation_t *cat = node->obj;
1356                 tre_ast_node_t *left = cat->left;
1357                 tre_ast_node_t *right = cat->right;
1358                 int reserved_tag = -1;
1359
1360
1361                 /* After processing right child. */
1362                 STACK_PUSHX(stack, voidptr, node);
1363                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1364
1365                 /* Process right child. */
1366                 STACK_PUSHX(stack, voidptr, right);
1367                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1368
1369                 /* After processing left child. */
1370                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1371                 if (left->num_tags > 0 && right->num_tags > 0)
1372                   {
1373                     /* Reserve the next tag to the right child. */
1374                     reserved_tag = next_tag;
1375                     next_tag++;
1376                   }
1377                 STACK_PUSHX(stack, int, reserved_tag);
1378                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1379
1380                 /* Process left child. */
1381                 STACK_PUSHX(stack, voidptr, left);
1382                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1383
1384                 }
1385               break;
1386             case ITERATION:
1387               {
1388                 tre_iteration_t *iter = node->obj;
1389
1390                 if (first_pass)
1391                   {
1392                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1393                   }
1394                 else
1395                   {
1396                     STACK_PUSHX(stack, int, tag);
1397                     STACK_PUSHX(stack, int, iter->minimal);
1398                   }
1399                 STACK_PUSHX(stack, voidptr, node);
1400                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1401
1402                 STACK_PUSHX(stack, voidptr, iter->arg);
1403                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1404
1405                 /* Regset is not empty, so add a tag here. */
1406                 if (regset[0] >= 0 || iter->minimal)
1407                   {
1408                     if (!first_pass)
1409                       {
1410                         int i;
1411                         status = tre_add_tag_left(mem, node, tag);
1412                         if (iter->minimal)
1413                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1414                         else
1415                           tnfa->tag_directions[tag] = direction;
1416                         if (minimal_tag >= 0)
1417                           {
1418                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1419                             tnfa->minimal_tags[i] = tag;
1420                             tnfa->minimal_tags[i + 1] = minimal_tag;
1421                             tnfa->minimal_tags[i + 2] = -1;
1422                             minimal_tag = -1;
1423                             num_minimals++;
1424                           }
1425                         tre_purge_regset(regset, tnfa, tag);
1426                       }
1427
1428                     regset[0] = -1;
1429                     tag = next_tag;
1430                     num_tags++;
1431                     next_tag++;
1432                   }
1433                 direction = TRE_TAG_MINIMIZE;
1434               }
1435               break;
1436             case UNION:
1437               {
1438                 tre_union_t *uni = node->obj;
1439                 tre_ast_node_t *left = uni->left;
1440                 tre_ast_node_t *right = uni->right;
1441                 int left_tag;
1442                 int right_tag;
1443
1444                 if (regset[0] >= 0)
1445                   {
1446                     left_tag = next_tag;
1447                     right_tag = next_tag + 1;
1448                   }
1449                 else
1450                   {
1451                     left_tag = tag;
1452                     right_tag = next_tag;
1453                   }
1454
1455                 /* After processing right child. */
1456                 STACK_PUSHX(stack, int, right_tag);
1457                 STACK_PUSHX(stack, int, left_tag);
1458                 STACK_PUSHX(stack, voidptr, regset);
1459                 STACK_PUSHX(stack, int, regset[0] >= 0);
1460                 STACK_PUSHX(stack, voidptr, node);
1461                 STACK_PUSHX(stack, voidptr, right);
1462                 STACK_PUSHX(stack, voidptr, left);
1463                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1464
1465                 /* Process right child. */
1466                 STACK_PUSHX(stack, voidptr, right);
1467                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1468
1469                 /* After processing left child. */
1470                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1471
1472                 /* Process left child. */
1473                 STACK_PUSHX(stack, voidptr, left);
1474                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1475
1476                 /* Regset is not empty, so add a tag here. */
1477                 if (regset[0] >= 0)
1478                   {
1479                     if (!first_pass)
1480                       {
1481                         int i;
1482                         status = tre_add_tag_left(mem, node, tag);
1483                         tnfa->tag_directions[tag] = direction;
1484                         if (minimal_tag >= 0)
1485                           {
1486                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1487                             tnfa->minimal_tags[i] = tag;
1488                             tnfa->minimal_tags[i + 1] = minimal_tag;
1489                             tnfa->minimal_tags[i + 2] = -1;
1490                             minimal_tag = -1;
1491                             num_minimals++;
1492                           }
1493                         tre_purge_regset(regset, tnfa, tag);
1494                       }
1495
1496                     regset[0] = -1;
1497                     tag = next_tag;
1498                     num_tags++;
1499                     next_tag++;
1500                   }
1501
1502                 if (node->num_submatches > 0)
1503                   {
1504                     /* The next two tags are reserved for markers. */
1505                     next_tag++;
1506                     tag = next_tag;
1507                     next_tag++;
1508                   }
1509
1510                 break;
1511               }
1512             }
1513
1514           if (node->submatch_id >= 0)
1515             {
1516               int i;
1517               /* Push this submatch on the parents stack. */
1518               for (i = 0; parents[i] >= 0; i++);
1519               parents[i] = node->submatch_id;
1520               parents[i + 1] = -1;
1521             }
1522
1523           break; /* end case: ADDTAGS_RECURSE */
1524
1525         case ADDTAGS_AFTER_ITERATION:
1526           {
1527             int minimal = 0;
1528             int enter_tag;
1529             node = tre_stack_pop_voidptr(stack);
1530             if (first_pass)
1531               {
1532                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1533                   + tre_stack_pop_int(stack);
1534                 minimal_tag = -1;
1535               }
1536             else
1537               {
1538                 minimal = tre_stack_pop_int(stack);
1539                 enter_tag = tre_stack_pop_int(stack);
1540                 if (minimal)
1541                   minimal_tag = enter_tag;
1542               }
1543
1544             if (!first_pass)
1545               {
1546                 if (minimal)
1547                   direction = TRE_TAG_MINIMIZE;
1548                 else
1549                   direction = TRE_TAG_MAXIMIZE;
1550               }
1551             break;
1552           }
1553
1554         case ADDTAGS_AFTER_CAT_LEFT:
1555           {
1556             int new_tag = tre_stack_pop_int(stack);
1557             next_tag = tre_stack_pop_int(stack);
1558             if (new_tag >= 0)
1559               {
1560                 tag = new_tag;
1561               }
1562             break;
1563           }
1564
1565         case ADDTAGS_AFTER_CAT_RIGHT:
1566           node = tre_stack_pop_voidptr(stack);
1567           if (first_pass)
1568             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1569               + ((tre_catenation_t *)node->obj)->right->num_tags;
1570           break;
1571
1572         case ADDTAGS_AFTER_UNION_LEFT:
1573           /* Lift the bottom of the `regset' array so that when processing
1574              the right operand the items currently in the array are
1575              invisible.  The original bottom was saved at ADDTAGS_UNION and
1576              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1577           while (*regset >= 0)
1578             regset++;
1579           break;
1580
1581         case ADDTAGS_AFTER_UNION_RIGHT:
1582           {
1583             int added_tags, tag_left, tag_right;
1584             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1585             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1586             node = tre_stack_pop_voidptr(stack);
1587             added_tags = tre_stack_pop_int(stack);
1588             if (first_pass)
1589               {
1590                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1591                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1592                   + ((node->num_submatches > 0) ? 2 : 0);
1593               }
1594             regset = tre_stack_pop_voidptr(stack);
1595             tag_left = tre_stack_pop_int(stack);
1596             tag_right = tre_stack_pop_int(stack);
1597
1598             /* Add tags after both children, the left child gets a smaller
1599                tag than the right child.  This guarantees that we prefer
1600                the left child over the right child. */
1601             /* XXX - This is not always necessary (if the children have
1602                tags which must be seen for every match of that child). */
1603             /* XXX - Check if this is the only place where tre_add_tag_right
1604                is used.  If so, use tre_add_tag_left (putting the tag before
1605                the child as opposed after the child) and throw away
1606                tre_add_tag_right. */
1607             if (node->num_submatches > 0)
1608               {
1609                 if (!first_pass)
1610                   {
1611                     status = tre_add_tag_right(mem, left, tag_left);
1612                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1613                     if (status == REG_OK)
1614                       status = tre_add_tag_right(mem, right, tag_right);
1615                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1616                   }
1617                 num_tags += 2;
1618               }
1619             direction = TRE_TAG_MAXIMIZE;
1620             break;
1621           }
1622
1623         default:
1624           assert(0);
1625           break;
1626
1627         } /* end switch(symbol) */
1628     } /* end while(tre_stack_num_objects(stack) > bottom) */
1629
1630   if (!first_pass)
1631     tre_purge_regset(regset, tnfa, tag);
1632
1633   if (!first_pass && minimal_tag >= 0)
1634     {
1635       int i;
1636       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1637       tnfa->minimal_tags[i] = tag;
1638       tnfa->minimal_tags[i + 1] = minimal_tag;
1639       tnfa->minimal_tags[i + 2] = -1;
1640       minimal_tag = -1;
1641       num_minimals++;
1642     }
1643
1644   assert(tree->num_tags == num_tags);
1645   tnfa->end_tag = num_tags;
1646   tnfa->num_tags = num_tags;
1647   tnfa->num_minimals = num_minimals;
1648   xfree(orig_regset);
1649   xfree(parents);
1650   xfree(saved_states);
1651   return status;
1652 }
1653
1654
1655
1656 /*
1657   AST to TNFA compilation routines.
1658 */
1659
1660 typedef enum {
1661   COPY_RECURSE,
1662   COPY_SET_RESULT_PTR
1663 } tre_copyast_symbol_t;
1664
1665 /* Flags for tre_copy_ast(). */
1666 #define COPY_REMOVE_TAGS         1
1667 #define COPY_MAXIMIZE_FIRST_TAG  2
1668
1669 static reg_errcode_t
1670 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1671              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1672              tre_ast_node_t **copy, int *max_pos)
1673 {
1674   reg_errcode_t status = REG_OK;
1675   int bottom = tre_stack_num_objects(stack);
1676   int num_copied = 0;
1677   int first_tag = 1;
1678   tre_ast_node_t **result = copy;
1679   tre_copyast_symbol_t symbol;
1680
1681   STACK_PUSH(stack, voidptr, ast);
1682   STACK_PUSH(stack, int, COPY_RECURSE);
1683
1684   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1685     {
1686       tre_ast_node_t *node;
1687       if (status != REG_OK)
1688         break;
1689
1690       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1691       switch (symbol)
1692         {
1693         case COPY_SET_RESULT_PTR:
1694           result = tre_stack_pop_voidptr(stack);
1695           break;
1696         case COPY_RECURSE:
1697           node = tre_stack_pop_voidptr(stack);
1698           switch (node->type)
1699             {
1700             case LITERAL:
1701               {
1702                 tre_literal_t *lit = node->obj;
1703                 int pos = lit->position;
1704                 int min = lit->code_min;
1705                 int max = lit->code_max;
1706                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1707                   {
1708                     /* XXX - e.g. [ab] has only one position but two
1709                        nodes, so we are creating holes in the state space
1710                        here.  Not fatal, just wastes memory. */
1711                     pos += *pos_add;
1712                     num_copied++;
1713                   }
1714                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1715                   {
1716                     /* Change this tag to empty. */
1717                     min = EMPTY;
1718                     max = pos = -1;
1719                   }
1720                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1721                          && first_tag)
1722                   {
1723                     /* Maximize the first tag. */
1724                     tag_directions[max] = TRE_TAG_MAXIMIZE;
1725                     first_tag = 0;
1726                   }
1727                 *result = tre_ast_new_literal(mem, min, max, pos);
1728                 if (*result == NULL)
1729                   status = REG_ESPACE;
1730                 else {
1731                   tre_literal_t *p = (*result)->obj;
1732                   p->class = lit->class;
1733                   p->neg_classes = lit->neg_classes;
1734                 }
1735
1736                 if (pos > *max_pos)
1737                   *max_pos = pos;
1738                 break;
1739               }
1740             case UNION:
1741               {
1742                 tre_union_t *uni = node->obj;
1743                 tre_union_t *tmp;
1744                 *result = tre_ast_new_union(mem, uni->left, uni->right);
1745                 if (*result == NULL)
1746                   {
1747                     status = REG_ESPACE;
1748                     break;
1749                   }
1750                 tmp = (*result)->obj;
1751                 result = &tmp->left;
1752                 STACK_PUSHX(stack, voidptr, uni->right);
1753                 STACK_PUSHX(stack, int, COPY_RECURSE);
1754                 STACK_PUSHX(stack, voidptr, &tmp->right);
1755                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1756                 STACK_PUSHX(stack, voidptr, uni->left);
1757                 STACK_PUSHX(stack, int, COPY_RECURSE);
1758                 break;
1759               }
1760             case CATENATION:
1761               {
1762                 tre_catenation_t *cat = node->obj;
1763                 tre_catenation_t *tmp;
1764                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1765                 if (*result == NULL)
1766                   {
1767                     status = REG_ESPACE;
1768                     break;
1769                   }
1770                 tmp = (*result)->obj;
1771                 tmp->left = NULL;
1772                 tmp->right = NULL;
1773                 result = &tmp->left;
1774
1775                 STACK_PUSHX(stack, voidptr, cat->right);
1776                 STACK_PUSHX(stack, int, COPY_RECURSE);
1777                 STACK_PUSHX(stack, voidptr, &tmp->right);
1778                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1779                 STACK_PUSHX(stack, voidptr, cat->left);
1780                 STACK_PUSHX(stack, int, COPY_RECURSE);
1781                 break;
1782               }
1783             case ITERATION:
1784               {
1785                 tre_iteration_t *iter = node->obj;
1786                 STACK_PUSHX(stack, voidptr, iter->arg);
1787                 STACK_PUSHX(stack, int, COPY_RECURSE);
1788                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1789                                            iter->max, iter->minimal);
1790                 if (*result == NULL)
1791                   {
1792                     status = REG_ESPACE;
1793                     break;
1794                   }
1795                 iter = (*result)->obj;
1796                 result = &iter->arg;
1797                 break;
1798               }
1799             default:
1800               assert(0);
1801               break;
1802             }
1803           break;
1804         }
1805     }
1806   *pos_add += num_copied;
1807   return status;
1808 }
1809
1810 typedef enum {
1811   EXPAND_RECURSE,
1812   EXPAND_AFTER_ITER
1813 } tre_expand_ast_symbol_t;
1814
1815 /* Expands each iteration node that has a finite nonzero minimum or maximum
1816    iteration count to a catenated sequence of copies of the node. */
1817 static reg_errcode_t
1818 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1819                int *position, tre_tag_direction_t *tag_directions)
1820 {
1821   reg_errcode_t status = REG_OK;
1822   int bottom = tre_stack_num_objects(stack);
1823   int pos_add = 0;
1824   int pos_add_total = 0;
1825   int max_pos = 0;
1826   int iter_depth = 0;
1827
1828   STACK_PUSHR(stack, voidptr, ast);
1829   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1830   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1831     {
1832       tre_ast_node_t *node;
1833       tre_expand_ast_symbol_t symbol;
1834
1835       if (status != REG_OK)
1836         break;
1837
1838       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1839       node = tre_stack_pop_voidptr(stack);
1840       switch (symbol)
1841         {
1842         case EXPAND_RECURSE:
1843           switch (node->type)
1844             {
1845             case LITERAL:
1846               {
1847                 tre_literal_t *lit= node->obj;
1848                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1849                   {
1850                     lit->position += pos_add;
1851                     if (lit->position > max_pos)
1852                       max_pos = lit->position;
1853                   }
1854                 break;
1855               }
1856             case UNION:
1857               {
1858                 tre_union_t *uni = node->obj;
1859                 STACK_PUSHX(stack, voidptr, uni->right);
1860                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1861                 STACK_PUSHX(stack, voidptr, uni->left);
1862                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1863                 break;
1864               }
1865             case CATENATION:
1866               {
1867                 tre_catenation_t *cat = node->obj;
1868                 STACK_PUSHX(stack, voidptr, cat->right);
1869                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1870                 STACK_PUSHX(stack, voidptr, cat->left);
1871                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1872                 break;
1873               }
1874             case ITERATION:
1875               {
1876                 tre_iteration_t *iter = node->obj;
1877                 STACK_PUSHX(stack, int, pos_add);
1878                 STACK_PUSHX(stack, voidptr, node);
1879                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1880                 STACK_PUSHX(stack, voidptr, iter->arg);
1881                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1882                 /* If we are going to expand this node at EXPAND_AFTER_ITER
1883                    then don't increase the `pos' fields of the nodes now, it
1884                    will get done when expanding. */
1885                 if (iter->min > 1 || iter->max > 1)
1886                   pos_add = 0;
1887                 iter_depth++;
1888                 break;
1889               }
1890             default:
1891               assert(0);
1892               break;
1893             }
1894           break;
1895         case EXPAND_AFTER_ITER:
1896           {
1897             tre_iteration_t *iter = node->obj;
1898             int pos_add_last;
1899             pos_add = tre_stack_pop_int(stack);
1900             pos_add_last = pos_add;
1901             if (iter->min > 1 || iter->max > 1)
1902               {
1903                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1904                 int j;
1905                 int pos_add_save = pos_add;
1906
1907                 /* Create a catenated sequence of copies of the node. */
1908                 for (j = 0; j < iter->min; j++)
1909                   {
1910                     tre_ast_node_t *copy;
1911                     /* Remove tags from all but the last copy. */
1912                     int flags = ((j + 1 < iter->min)
1913                                  ? COPY_REMOVE_TAGS
1914                                  : COPY_MAXIMIZE_FIRST_TAG);
1915                     pos_add_save = pos_add;
1916                     status = tre_copy_ast(mem, stack, iter->arg, flags,
1917                                           &pos_add, tag_directions, &copy,
1918                                           &max_pos);
1919                     if (status != REG_OK)
1920                       return status;
1921                     if (seq1 != NULL)
1922                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
1923                     else
1924                       seq1 = copy;
1925                     if (seq1 == NULL)
1926                       return REG_ESPACE;
1927                   }
1928
1929                 if (iter->max == -1)
1930                   {
1931                     /* No upper limit. */
1932                     pos_add_save = pos_add;
1933                     status = tre_copy_ast(mem, stack, iter->arg, 0,
1934                                           &pos_add, NULL, &seq2, &max_pos);
1935                     if (status != REG_OK)
1936                       return status;
1937                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1938                     if (seq2 == NULL)
1939                       return REG_ESPACE;
1940                   }
1941                 else
1942                   {
1943                     for (j = iter->min; j < iter->max; j++)
1944                       {
1945                         tre_ast_node_t *tmp, *copy;
1946                         pos_add_save = pos_add;
1947                         status = tre_copy_ast(mem, stack, iter->arg, 0,
1948                                               &pos_add, NULL, &copy, &max_pos);
1949                         if (status != REG_OK)
1950                           return status;
1951                         if (seq2 != NULL)
1952                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
1953                         else
1954                           seq2 = copy;
1955                         if (seq2 == NULL)
1956                           return REG_ESPACE;
1957                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1958                         if (tmp == NULL)
1959                           return REG_ESPACE;
1960                         seq2 = tre_ast_new_union(mem, tmp, seq2);
1961                         if (seq2 == NULL)
1962                           return REG_ESPACE;
1963                       }
1964                   }
1965
1966                 pos_add = pos_add_save;
1967                 if (seq1 == NULL)
1968                   seq1 = seq2;
1969                 else if (seq2 != NULL)
1970                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1971                 if (seq1 == NULL)
1972                   return REG_ESPACE;
1973                 node->obj = seq1->obj;
1974                 node->type = seq1->type;
1975               }
1976
1977             iter_depth--;
1978             pos_add_total += pos_add - pos_add_last;
1979             if (iter_depth == 0)
1980               pos_add = pos_add_total;
1981
1982             break;
1983           }
1984         default:
1985           assert(0);
1986           break;
1987         }
1988     }
1989
1990   *position += pos_add_total;
1991
1992   /* `max_pos' should never be larger than `*position' if the above
1993      code works, but just an extra safeguard let's make sure
1994      `*position' is set large enough so enough memory will be
1995      allocated for the transition table. */
1996   if (max_pos > *position)
1997     *position = max_pos;
1998
1999   return status;
2000 }
2001
2002 static tre_pos_and_tags_t *
2003 tre_set_empty(tre_mem_t mem)
2004 {
2005   tre_pos_and_tags_t *new_set;
2006
2007   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2008   if (new_set == NULL)
2009     return NULL;
2010
2011   new_set[0].position = -1;
2012   new_set[0].code_min = -1;
2013   new_set[0].code_max = -1;
2014
2015   return new_set;
2016 }
2017
2018 static tre_pos_and_tags_t *
2019 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2020             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2021 {
2022   tre_pos_and_tags_t *new_set;
2023
2024   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2025   if (new_set == NULL)
2026     return NULL;
2027
2028   new_set[0].position = position;
2029   new_set[0].code_min = code_min;
2030   new_set[0].code_max = code_max;
2031   new_set[0].class = class;
2032   new_set[0].neg_classes = neg_classes;
2033   new_set[0].backref = backref;
2034   new_set[1].position = -1;
2035   new_set[1].code_min = -1;
2036   new_set[1].code_max = -1;
2037
2038   return new_set;
2039 }
2040
2041 static tre_pos_and_tags_t *
2042 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2043               int *tags, int assertions)
2044 {
2045   int s1, s2, i, j;
2046   tre_pos_and_tags_t *new_set;
2047   int *new_tags;
2048   int num_tags;
2049
2050   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2051   for (s1 = 0; set1[s1].position >= 0; s1++);
2052   for (s2 = 0; set2[s2].position >= 0; s2++);
2053   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2054   if (!new_set )
2055     return NULL;
2056
2057   for (s1 = 0; set1[s1].position >= 0; s1++)
2058     {
2059       new_set[s1].position = set1[s1].position;
2060       new_set[s1].code_min = set1[s1].code_min;
2061       new_set[s1].code_max = set1[s1].code_max;
2062       new_set[s1].assertions = set1[s1].assertions | assertions;
2063       new_set[s1].class = set1[s1].class;
2064       new_set[s1].neg_classes = set1[s1].neg_classes;
2065       new_set[s1].backref = set1[s1].backref;
2066       if (set1[s1].tags == NULL && tags == NULL)
2067         new_set[s1].tags = NULL;
2068       else
2069         {
2070           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2071           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2072                                          * (i + num_tags + 1)));
2073           if (new_tags == NULL)
2074             return NULL;
2075           for (j = 0; j < i; j++)
2076             new_tags[j] = set1[s1].tags[j];
2077           for (i = 0; i < num_tags; i++)
2078             new_tags[j + i] = tags[i];
2079           new_tags[j + i] = -1;
2080           new_set[s1].tags = new_tags;
2081         }
2082     }
2083
2084   for (s2 = 0; set2[s2].position >= 0; s2++)
2085     {
2086       new_set[s1 + s2].position = set2[s2].position;
2087       new_set[s1 + s2].code_min = set2[s2].code_min;
2088       new_set[s1 + s2].code_max = set2[s2].code_max;
2089       /* XXX - why not | assertions here as well? */
2090       new_set[s1 + s2].assertions = set2[s2].assertions;
2091       new_set[s1 + s2].class = set2[s2].class;
2092       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2093       new_set[s1 + s2].backref = set2[s2].backref;
2094       if (set2[s2].tags == NULL)
2095         new_set[s1 + s2].tags = NULL;
2096       else
2097         {
2098           for (i = 0; set2[s2].tags[i] >= 0; i++);
2099           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2100           if (new_tags == NULL)
2101             return NULL;
2102           for (j = 0; j < i; j++)
2103             new_tags[j] = set2[s2].tags[j];
2104           new_tags[j] = -1;
2105           new_set[s1 + s2].tags = new_tags;
2106         }
2107     }
2108   new_set[s1 + s2].position = -1;
2109   return new_set;
2110 }
2111
2112 /* Finds the empty path through `node' which is the one that should be
2113    taken according to POSIX.2 rules, and adds the tags on that path to
2114    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2115    set to the number of tags seen on the path. */
2116 static reg_errcode_t
2117 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2118                 int *assertions, int *num_tags_seen)
2119 {
2120   tre_literal_t *lit;
2121   tre_union_t *uni;
2122   tre_catenation_t *cat;
2123   tre_iteration_t *iter;
2124   int i;
2125   int bottom = tre_stack_num_objects(stack);
2126   reg_errcode_t status = REG_OK;
2127   if (num_tags_seen)
2128     *num_tags_seen = 0;
2129
2130   status = tre_stack_push_voidptr(stack, node);
2131
2132   /* Walk through the tree recursively. */
2133   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2134     {
2135       node = tre_stack_pop_voidptr(stack);
2136
2137       switch (node->type)
2138         {
2139         case LITERAL:
2140           lit = (tre_literal_t *)node->obj;
2141           switch (lit->code_min)
2142             {
2143             case TAG:
2144               if (lit->code_max >= 0)
2145                 {
2146                   if (tags != NULL)
2147                     {
2148                       /* Add the tag to `tags'. */
2149                       for (i = 0; tags[i] >= 0; i++)
2150                         if (tags[i] == lit->code_max)
2151                           break;
2152                       if (tags[i] < 0)
2153                         {
2154                           tags[i] = lit->code_max;
2155                           tags[i + 1] = -1;
2156                         }
2157                     }
2158                   if (num_tags_seen)
2159                     (*num_tags_seen)++;
2160                 }
2161               break;
2162             case ASSERTION:
2163               assert(lit->code_max >= 1
2164                      || lit->code_max <= ASSERT_LAST);
2165               if (assertions != NULL)
2166                 *assertions |= lit->code_max;
2167               break;
2168             case EMPTY:
2169               break;
2170             default:
2171               assert(0);
2172               break;
2173             }
2174           break;
2175
2176         case UNION:
2177           /* Subexpressions starting earlier take priority over ones
2178              starting later, so we prefer the left subexpression over the
2179              right subexpression. */
2180           uni = (tre_union_t *)node->obj;
2181           if (uni->left->nullable)
2182             STACK_PUSHX(stack, voidptr, uni->left)
2183           else if (uni->right->nullable)
2184             STACK_PUSHX(stack, voidptr, uni->right)
2185           else
2186             assert(0);
2187           break;
2188
2189         case CATENATION:
2190           /* The path must go through both children. */
2191           cat = (tre_catenation_t *)node->obj;
2192           assert(cat->left->nullable);
2193           assert(cat->right->nullable);
2194           STACK_PUSHX(stack, voidptr, cat->left);
2195           STACK_PUSHX(stack, voidptr, cat->right);
2196           break;
2197
2198         case ITERATION:
2199           /* A match with an empty string is preferred over no match at
2200              all, so we go through the argument if possible. */
2201           iter = (tre_iteration_t *)node->obj;
2202           if (iter->arg->nullable)
2203             STACK_PUSHX(stack, voidptr, iter->arg);
2204           break;
2205
2206         default:
2207           assert(0);
2208           break;
2209         }
2210     }
2211
2212   return status;
2213 }
2214
2215
2216 typedef enum {
2217   NFL_RECURSE,
2218   NFL_POST_UNION,
2219   NFL_POST_CATENATION,
2220   NFL_POST_ITERATION
2221 } tre_nfl_stack_symbol_t;
2222
2223
2224 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2225    the nodes of the AST `tree'. */
2226 static reg_errcode_t
2227 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2228 {
2229   int bottom = tre_stack_num_objects(stack);
2230
2231   STACK_PUSHR(stack, voidptr, tree);
2232   STACK_PUSHR(stack, int, NFL_RECURSE);
2233
2234   while (tre_stack_num_objects(stack) > bottom)
2235     {
2236       tre_nfl_stack_symbol_t symbol;
2237       tre_ast_node_t *node;
2238
2239       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2240       node = tre_stack_pop_voidptr(stack);
2241       switch (symbol)
2242         {
2243         case NFL_RECURSE:
2244           switch (node->type)
2245             {
2246             case LITERAL:
2247               {
2248                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2249                 if (IS_BACKREF(lit))
2250                   {
2251                     /* Back references: nullable = false, firstpos = {i},
2252                        lastpos = {i}. */
2253                     node->nullable = 0;
2254                     node->firstpos = tre_set_one(mem, lit->position, 0,
2255                                              TRE_CHAR_MAX, 0, NULL, -1);
2256                     if (!node->firstpos)
2257                       return REG_ESPACE;
2258                     node->lastpos = tre_set_one(mem, lit->position, 0,
2259                                                 TRE_CHAR_MAX, 0, NULL,
2260                                                 (int)lit->code_max);
2261                     if (!node->lastpos)
2262                       return REG_ESPACE;
2263                   }
2264                 else if (lit->code_min < 0)
2265                   {
2266                     /* Tags, empty strings, params, and zero width assertions:
2267                        nullable = true, firstpos = {}, and lastpos = {}. */
2268                     node->nullable = 1;
2269                     node->firstpos = tre_set_empty(mem);
2270                     if (!node->firstpos)
2271                       return REG_ESPACE;
2272                     node->lastpos = tre_set_empty(mem);
2273                     if (!node->lastpos)
2274                       return REG_ESPACE;
2275                   }
2276                 else
2277                   {
2278                     /* Literal at position i: nullable = false, firstpos = {i},
2279                        lastpos = {i}. */
2280                     node->nullable = 0;
2281                     node->firstpos =
2282                       tre_set_one(mem, lit->position, (int)lit->code_min,
2283                                   (int)lit->code_max, 0, NULL, -1);
2284                     if (!node->firstpos)
2285                       return REG_ESPACE;
2286                     node->lastpos = tre_set_one(mem, lit->position,
2287                                                 (int)lit->code_min,
2288                                                 (int)lit->code_max,
2289                                                 lit->class, lit->neg_classes,
2290                                                 -1);
2291                     if (!node->lastpos)
2292                       return REG_ESPACE;
2293                   }
2294                 break;
2295               }
2296
2297             case UNION:
2298               /* Compute the attributes for the two subtrees, and after that
2299                  for this node. */
2300               STACK_PUSHR(stack, voidptr, node);
2301               STACK_PUSHR(stack, int, NFL_POST_UNION);
2302               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2303               STACK_PUSHR(stack, int, NFL_RECURSE);
2304               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2305               STACK_PUSHR(stack, int, NFL_RECURSE);
2306               break;
2307
2308             case CATENATION:
2309               /* Compute the attributes for the two subtrees, and after that
2310                  for this node. */
2311               STACK_PUSHR(stack, voidptr, node);
2312               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2313               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2314               STACK_PUSHR(stack, int, NFL_RECURSE);
2315               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2316               STACK_PUSHR(stack, int, NFL_RECURSE);
2317               break;
2318
2319             case ITERATION:
2320               /* Compute the attributes for the subtree, and after that for
2321                  this node. */
2322               STACK_PUSHR(stack, voidptr, node);
2323               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2324               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2325               STACK_PUSHR(stack, int, NFL_RECURSE);
2326               break;
2327             }
2328           break; /* end case: NFL_RECURSE */
2329
2330         case NFL_POST_UNION:
2331           {
2332             tre_union_t *uni = (tre_union_t *)node->obj;
2333             node->nullable = uni->left->nullable || uni->right->nullable;
2334             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2335                                            uni->right->firstpos, NULL, 0);
2336             if (!node->firstpos)
2337               return REG_ESPACE;
2338             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2339                                           uni->right->lastpos, NULL, 0);
2340             if (!node->lastpos)
2341               return REG_ESPACE;
2342             break;
2343           }
2344
2345         case NFL_POST_ITERATION:
2346           {
2347             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2348
2349             if (iter->min == 0 || iter->arg->nullable)
2350               node->nullable = 1;
2351             else
2352               node->nullable = 0;
2353             node->firstpos = iter->arg->firstpos;
2354             node->lastpos = iter->arg->lastpos;
2355             break;
2356           }
2357
2358         case NFL_POST_CATENATION:
2359           {
2360             int num_tags, *tags, assertions;
2361             reg_errcode_t status;
2362             tre_catenation_t *cat = node->obj;
2363             node->nullable = cat->left->nullable && cat->right->nullable;
2364
2365             /* Compute firstpos. */
2366             if (cat->left->nullable)
2367               {
2368                 /* The left side matches the empty string.  Make a first pass
2369                    with tre_match_empty() to get the number of tags and
2370                    parameters. */
2371                 status = tre_match_empty(stack, cat->left,
2372                                          NULL, NULL, &num_tags);
2373                 if (status != REG_OK)
2374                   return status;
2375                 /* Allocate arrays for the tags and parameters. */
2376                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2377                 if (!tags)
2378                   return REG_ESPACE;
2379                 tags[0] = -1;
2380                 assertions = 0;
2381                 /* Second pass with tre_mach_empty() to get the list of
2382                    tags and parameters. */
2383                 status = tre_match_empty(stack, cat->left, tags,
2384                                          &assertions, NULL);
2385                 if (status != REG_OK)
2386                   {
2387                     xfree(tags);
2388                     return status;
2389                   }
2390                 node->firstpos =
2391                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2392                                 tags, assertions);
2393                 xfree(tags);
2394                 if (!node->firstpos)
2395                   return REG_ESPACE;
2396               }
2397             else
2398               {
2399                 node->firstpos = cat->left->firstpos;
2400               }
2401
2402             /* Compute lastpos. */
2403             if (cat->right->nullable)
2404               {
2405                 /* The right side matches the empty string.  Make a first pass
2406                    with tre_match_empty() to get the number of tags and
2407                    parameters. */
2408                 status = tre_match_empty(stack, cat->right,
2409                                          NULL, NULL, &num_tags);
2410                 if (status != REG_OK)
2411                   return status;
2412                 /* Allocate arrays for the tags and parameters. */
2413                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2414                 if (!tags)
2415                   return REG_ESPACE;
2416                 tags[0] = -1;
2417                 assertions = 0;
2418                 /* Second pass with tre_mach_empty() to get the list of
2419                    tags and parameters. */
2420                 status = tre_match_empty(stack, cat->right, tags,
2421                                          &assertions, NULL);
2422                 if (status != REG_OK)
2423                   {
2424                     xfree(tags);
2425                     return status;
2426                   }
2427                 node->lastpos =
2428                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2429                                 tags, assertions);
2430                 xfree(tags);
2431                 if (!node->lastpos)
2432                   return REG_ESPACE;
2433               }
2434             else
2435               {
2436                 node->lastpos = cat->right->lastpos;
2437               }
2438             break;
2439           }
2440
2441         default:
2442           assert(0);
2443           break;
2444         }
2445     }
2446
2447   return REG_OK;
2448 }
2449
2450
2451 /* Adds a transition from each position in `p1' to each position in `p2'. */
2452 static reg_errcode_t
2453 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2454                tre_tnfa_transition_t *transitions,
2455                int *counts, int *offs)
2456 {
2457   tre_pos_and_tags_t *orig_p2 = p2;
2458   tre_tnfa_transition_t *trans;
2459   int i, j, k, l, dup, prev_p2_pos;
2460
2461   if (transitions != NULL)
2462     while (p1->position >= 0)
2463       {
2464         p2 = orig_p2;
2465         prev_p2_pos = -1;
2466         while (p2->position >= 0)
2467           {
2468             /* Optimization: if this position was already handled, skip it. */
2469             if (p2->position == prev_p2_pos)
2470               {
2471                 p2++;
2472                 continue;
2473               }
2474             prev_p2_pos = p2->position;
2475             /* Set `trans' to point to the next unused transition from
2476                position `p1->position'. */
2477             trans = transitions + offs[p1->position];
2478             while (trans->state != NULL)
2479               {
2480 #if 0
2481                 /* If we find a previous transition from `p1->position' to
2482                    `p2->position', it is overwritten.  This can happen only
2483                    if there are nested loops in the regexp, like in "((a)*)*".
2484                    In POSIX.2 repetition using the outer loop is always
2485                    preferred over using the inner loop.  Therefore the
2486                    transition for the inner loop is useless and can be thrown
2487                    away. */
2488                 /* XXX - The same position is used for all nodes in a bracket
2489                    expression, so this optimization cannot be used (it will
2490                    break bracket expressions) unless I figure out a way to
2491                    detect it here. */
2492                 if (trans->state_id == p2->position)
2493                   {
2494                     break;
2495                   }
2496 #endif
2497                 trans++;
2498               }
2499
2500             if (trans->state == NULL)
2501               (trans + 1)->state = NULL;
2502             /* Use the character ranges, assertions, etc. from `p1' for
2503                the transition from `p1' to `p2'. */
2504             trans->code_min = p1->code_min;
2505             trans->code_max = p1->code_max;
2506             trans->state = transitions + offs[p2->position];
2507             trans->state_id = p2->position;
2508             trans->assertions = p1->assertions | p2->assertions
2509               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2510               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2511             if (p1->backref >= 0)
2512               {
2513                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2514                 assert(p2->backref < 0);
2515                 trans->u.backref = p1->backref;
2516                 trans->assertions |= ASSERT_BACKREF;
2517               }
2518             else
2519               trans->u.class = p1->class;
2520             if (p1->neg_classes != NULL)
2521               {
2522                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2523                 trans->neg_classes =
2524                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2525                 if (trans->neg_classes == NULL)
2526                   return REG_ESPACE;
2527                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2528                   trans->neg_classes[i] = p1->neg_classes[i];
2529                 trans->neg_classes[i] = (tre_ctype_t)0;
2530               }
2531             else
2532               trans->neg_classes = NULL;
2533
2534             /* Find out how many tags this transition has. */
2535             i = 0;
2536             if (p1->tags != NULL)
2537               while(p1->tags[i] >= 0)
2538                 i++;
2539             j = 0;
2540             if (p2->tags != NULL)
2541               while(p2->tags[j] >= 0)
2542                 j++;
2543
2544             /* If we are overwriting a transition, free the old tag array. */
2545             if (trans->tags != NULL)
2546               xfree(trans->tags);
2547             trans->tags = NULL;
2548
2549             /* If there were any tags, allocate an array and fill it. */
2550             if (i + j > 0)
2551               {
2552                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2553                 if (!trans->tags)
2554                   return REG_ESPACE;
2555                 i = 0;
2556                 if (p1->tags != NULL)
2557                   while(p1->tags[i] >= 0)
2558                     {
2559                       trans->tags[i] = p1->tags[i];
2560                       i++;
2561                     }
2562                 l = i;
2563                 j = 0;
2564                 if (p2->tags != NULL)
2565                   while (p2->tags[j] >= 0)
2566                     {
2567                       /* Don't add duplicates. */
2568                       dup = 0;
2569                       for (k = 0; k < i; k++)
2570                         if (trans->tags[k] == p2->tags[j])
2571                           {
2572                             dup = 1;
2573                             break;
2574                           }
2575                       if (!dup)
2576                         trans->tags[l++] = p2->tags[j];
2577                       j++;
2578                     }
2579                 trans->tags[l] = -1;
2580               }
2581
2582             p2++;
2583           }
2584         p1++;
2585       }
2586   else
2587     /* Compute a maximum limit for the number of transitions leaving
2588        from each state. */
2589     while (p1->position >= 0)
2590       {
2591         p2 = orig_p2;
2592         while (p2->position >= 0)
2593           {
2594             counts[p1->position]++;
2595             p2++;
2596           }
2597         p1++;
2598       }
2599   return REG_OK;
2600 }
2601
2602 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
2603    labelled with one character range (there are no transitions on empty
2604    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2605    the regexp. */
2606 static reg_errcode_t
2607 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2608                 int *counts, int *offs)
2609 {
2610   tre_union_t *uni;
2611   tre_catenation_t *cat;
2612   tre_iteration_t *iter;
2613   reg_errcode_t errcode = REG_OK;
2614
2615   /* XXX - recurse using a stack!. */
2616   switch (node->type)
2617     {
2618     case LITERAL:
2619       break;
2620     case UNION:
2621       uni = (tre_union_t *)node->obj;
2622       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2623       if (errcode != REG_OK)
2624         return errcode;
2625       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2626       break;
2627
2628     case CATENATION:
2629       cat = (tre_catenation_t *)node->obj;
2630       /* Add a transition from each position in cat->left->lastpos
2631          to each position in cat->right->firstpos. */
2632       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2633                                transitions, counts, offs);
2634       if (errcode != REG_OK)
2635         return errcode;
2636       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2637       if (errcode != REG_OK)
2638         return errcode;
2639       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2640       break;
2641
2642     case ITERATION:
2643       iter = (tre_iteration_t *)node->obj;
2644       assert(iter->max == -1 || iter->max == 1);
2645
2646       if (iter->max == -1)
2647         {
2648           assert(iter->min == 0 || iter->min == 1);
2649           /* Add a transition from each last position in the iterated
2650              expression to each first position. */
2651           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2652                                    transitions, counts, offs);
2653           if (errcode != REG_OK)
2654             return errcode;
2655         }
2656       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2657       break;
2658     }
2659   return errcode;
2660 }
2661
2662
2663 #define ERROR_EXIT(err)           \
2664   do                              \
2665     {                             \
2666       errcode = err;              \
2667       if (/*CONSTCOND*/1)         \
2668         goto error_exit;          \
2669     }                             \
2670  while (/*CONSTCOND*/0)
2671
2672
2673 int
2674 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2675 {
2676   tre_stack_t *stack;
2677   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2678   tre_pos_and_tags_t *p;
2679   int *counts = NULL, *offs = NULL;
2680   int i, add = 0;
2681   tre_tnfa_transition_t *transitions, *initial;
2682   tre_tnfa_t *tnfa = NULL;
2683   tre_submatch_data_t *submatch_data;
2684   tre_tag_direction_t *tag_directions = NULL;
2685   reg_errcode_t errcode;
2686   tre_mem_t mem;
2687
2688   /* Parse context. */
2689   tre_parse_ctx_t parse_ctx;
2690
2691   /* Allocate a stack used throughout the compilation process for various
2692      purposes. */
2693   stack = tre_stack_new(512, 1024000, 128);
2694   if (!stack)
2695     return REG_ESPACE;
2696   /* Allocate a fast memory allocator. */
2697   mem = tre_mem_new();
2698   if (!mem)
2699     {
2700       tre_stack_destroy(stack);
2701       return REG_ESPACE;
2702     }
2703
2704   /* Parse the regexp. */
2705   memset(&parse_ctx, 0, sizeof(parse_ctx));
2706   parse_ctx.mem = mem;
2707   parse_ctx.stack = stack;
2708   parse_ctx.re = regex;
2709   parse_ctx.cflags = cflags;
2710   parse_ctx.max_backref = -1;
2711   errcode = tre_parse(&parse_ctx);
2712   if (errcode != REG_OK)
2713     ERROR_EXIT(errcode);
2714   preg->re_nsub = parse_ctx.submatch_id - 1;
2715   tree = parse_ctx.n;
2716
2717 #ifdef TRE_DEBUG
2718   tre_ast_print(tree);
2719 #endif /* TRE_DEBUG */
2720
2721   /* Referring to nonexistent subexpressions is illegal. */
2722   if (parse_ctx.max_backref > (int)preg->re_nsub)
2723     ERROR_EXIT(REG_ESUBREG);
2724
2725   /* Allocate the TNFA struct. */
2726   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2727   if (tnfa == NULL)
2728     ERROR_EXIT(REG_ESPACE);
2729   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2730   tnfa->have_approx = 0;
2731   tnfa->num_submatches = parse_ctx.submatch_id;
2732
2733   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2734      regexp does not have back references, this can be skipped. */
2735   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2736     {
2737
2738       /* Figure out how many tags we will need. */
2739       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2740       if (errcode != REG_OK)
2741         ERROR_EXIT(errcode);
2742
2743       if (tnfa->num_tags > 0)
2744         {
2745           tag_directions = xmalloc(sizeof(*tag_directions)
2746                                    * (tnfa->num_tags + 1));
2747           if (tag_directions == NULL)
2748             ERROR_EXIT(REG_ESPACE);
2749           tnfa->tag_directions = tag_directions;
2750           memset(tag_directions, -1,
2751                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
2752         }
2753       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2754                                    sizeof(*tnfa->minimal_tags));
2755       if (tnfa->minimal_tags == NULL)
2756         ERROR_EXIT(REG_ESPACE);
2757
2758       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2759                               sizeof(*submatch_data));
2760       if (submatch_data == NULL)
2761         ERROR_EXIT(REG_ESPACE);
2762       tnfa->submatch_data = submatch_data;
2763
2764       errcode = tre_add_tags(mem, stack, tree, tnfa);
2765       if (errcode != REG_OK)
2766         ERROR_EXIT(errcode);
2767
2768     }
2769
2770   /* Expand iteration nodes. */
2771   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2772                            tag_directions);
2773   if (errcode != REG_OK)
2774     ERROR_EXIT(errcode);
2775
2776   /* Add a dummy node for the final state.
2777      XXX - For certain patterns this dummy node can be optimized away,
2778            for example "a*" or "ab*".   Figure out a simple way to detect
2779            this possibility. */
2780   tmp_ast_l = tree;
2781   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2782   if (tmp_ast_r == NULL)
2783     ERROR_EXIT(REG_ESPACE);
2784
2785   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2786   if (tree == NULL)
2787     ERROR_EXIT(REG_ESPACE);
2788
2789   errcode = tre_compute_nfl(mem, stack, tree);
2790   if (errcode != REG_OK)
2791     ERROR_EXIT(errcode);
2792
2793   counts = xmalloc(sizeof(int) * parse_ctx.position);
2794   if (counts == NULL)
2795     ERROR_EXIT(REG_ESPACE);
2796
2797   offs = xmalloc(sizeof(int) * parse_ctx.position);
2798   if (offs == NULL)
2799     ERROR_EXIT(REG_ESPACE);
2800
2801   for (i = 0; i < parse_ctx.position; i++)
2802     counts[i] = 0;
2803   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2804
2805   add = 0;
2806   for (i = 0; i < parse_ctx.position; i++)
2807     {
2808       offs[i] = add;
2809       add += counts[i] + 1;
2810       counts[i] = 0;
2811     }
2812   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2813   if (transitions == NULL)
2814     ERROR_EXIT(REG_ESPACE);
2815   tnfa->transitions = transitions;
2816   tnfa->num_transitions = add;
2817
2818   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2819   if (errcode != REG_OK)
2820     ERROR_EXIT(errcode);
2821
2822   tnfa->firstpos_chars = NULL;
2823
2824   p = tree->firstpos;
2825   i = 0;
2826   while (p->position >= 0)
2827     {
2828       i++;
2829       p++;
2830     }
2831
2832   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2833   if (initial == NULL)
2834     ERROR_EXIT(REG_ESPACE);
2835   tnfa->initial = initial;
2836
2837   i = 0;
2838   for (p = tree->firstpos; p->position >= 0; p++)
2839     {
2840       initial[i].state = transitions + offs[p->position];
2841       initial[i].state_id = p->position;
2842       initial[i].tags = NULL;
2843       /* Copy the arrays p->tags, and p->params, they are allocated
2844          from a tre_mem object. */
2845       if (p->tags)
2846         {
2847           int j;
2848           for (j = 0; p->tags[j] >= 0; j++);
2849           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2850           if (!initial[i].tags)
2851             ERROR_EXIT(REG_ESPACE);
2852           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2853         }
2854       initial[i].assertions = p->assertions;
2855       i++;
2856     }
2857   initial[i].state = NULL;
2858
2859   tnfa->num_transitions = add;
2860   tnfa->final = transitions + offs[tree->lastpos[0].position];
2861   tnfa->num_states = parse_ctx.position;
2862   tnfa->cflags = cflags;
2863
2864   tre_mem_destroy(mem);
2865   tre_stack_destroy(stack);
2866   xfree(counts);
2867   xfree(offs);
2868
2869   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2870   return REG_OK;
2871
2872  error_exit:
2873   /* Free everything that was allocated and return the error code. */
2874   tre_mem_destroy(mem);
2875   if (stack != NULL)
2876     tre_stack_destroy(stack);
2877   if (counts != NULL)
2878     xfree(counts);
2879   if (offs != NULL)
2880     xfree(offs);
2881   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2882   regfree(preg);
2883   return errcode;
2884 }
2885
2886
2887
2888
2889 void
2890 regfree(regex_t *preg)
2891 {
2892   tre_tnfa_t *tnfa;
2893   unsigned int i;
2894   tre_tnfa_transition_t *trans;
2895
2896   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2897   if (!tnfa)
2898     return;
2899
2900   for (i = 0; i < tnfa->num_transitions; i++)
2901     if (tnfa->transitions[i].state)
2902       {
2903         if (tnfa->transitions[i].tags)
2904           xfree(tnfa->transitions[i].tags);
2905         if (tnfa->transitions[i].neg_classes)
2906           xfree(tnfa->transitions[i].neg_classes);
2907       }
2908   if (tnfa->transitions)
2909     xfree(tnfa->transitions);
2910
2911   if (tnfa->initial)
2912     {
2913       for (trans = tnfa->initial; trans->state; trans++)
2914         {
2915           if (trans->tags)
2916             xfree(trans->tags);
2917         }
2918       xfree(tnfa->initial);
2919     }
2920
2921   if (tnfa->submatch_data)
2922     {
2923       for (i = 0; i < tnfa->num_submatches; i++)
2924         if (tnfa->submatch_data[i].parents)
2925           xfree(tnfa->submatch_data[i].parents);
2926       xfree(tnfa->submatch_data);
2927     }
2928
2929   if (tnfa->tag_directions)
2930     xfree(tnfa->tag_directions);
2931   if (tnfa->firstpos_chars)
2932     xfree(tnfa->firstpos_chars);
2933   if (tnfa->minimal_tags)
2934     xfree(tnfa->minimal_tags);
2935   xfree(tnfa);
2936 }