src/regex/regcomp.c

   1 /*
   2   regcomp.c - TRE POSIX compatible regex compilation functions.
   3
   4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   5   All rights reserved.
   6
   7   Redistribution and use in source and binary forms, with or without
   8   modification, are permitted provided that the following conditions
   9   are met:
  10
  11     1. Redistributions of source code must retain the above copyright
  12        notice, this list of conditions and the following disclaimer.
  13
  14     2. Redistributions in binary form must reproduce the above copyright
  15        notice, this list of conditions and the following disclaimer in the
  16        documentation and/or other materials provided with the distribution.
  17
  18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 */
  31
  32 #include <string.h>
  33 #include <errno.h>
  34 #include <stdlib.h>
  35 #include <regex.h>
  36 #include <limits.h>
  37 #include <stdint.h>
  38
  39 #include "tre.h"
  40
  41 #include <assert.h>
  42
  43 /***********************************************************************
  44  from tre-compile.h
  45 ***********************************************************************/
  46
  47 typedef struct {
  48   int position;
  49   int code_min;
  50   int code_max;
  51   int *tags;
  52   int assertions;
  53   tre_ctype_t class;
  54   tre_ctype_t *neg_classes;
  55   int backref;
  56   int *params;
  57 } tre_pos_and_tags_t;
  58
  59
  60 /***********************************************************************
  61  from tre-ast.c and tre-ast.h
  62 ***********************************************************************/
  63
  64 /* The different AST node types. */
  65 typedef enum {
  66   LITERAL,
  67   CATENATION,
  68   ITERATION,
  69   UNION
  70 } tre_ast_type_t;
  71
  72 /* Special subtypes of TRE_LITERAL. */
  73 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  74 #define ASSERTION -2   /* Assertion leaf. */
  75 #define TAG       -3   /* Tag leaf. */
  76 #define BACKREF   -4   /* Back reference leaf. */
  77
  78 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  79 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  80 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  81 #define IS_TAG(x)       ((x)->code_min == TAG)
  82 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  83
  84
  85 /* A generic AST node.  All AST nodes consist of this node on the top
  86    level with `obj' pointing to the actual content. */
  87 typedef struct {
  88   tre_ast_type_t type;   /* Type of the node. */
  89   void *obj;             /* Pointer to actual node. */
  90   int nullable;
  91   int submatch_id;
  92   int num_submatches;
  93   int num_tags;
  94   tre_pos_and_tags_t *firstpos;
  95   tre_pos_and_tags_t *lastpos;
  96 } tre_ast_node_t;
  97
  98
  99 /* A "literal" node.  These are created for assertions, back references,
 100    tags, matching parameter settings, and all expressions that match one
 101    character. */
 102 typedef struct {
 103   long code_min;
 104   long code_max;
 105   int position;
 106   union {
 107     tre_ctype_t class;
 108     int *params;
 109   } u;
 110   tre_ctype_t *neg_classes;
 111 } tre_literal_t;
 112
 113 /* A "catenation" node.  These are created when two regexps are concatenated.
 114    If there are more than one subexpressions in sequence, the `left' part
 115    holds all but the last, and `right' part holds the last subexpression
 116    (catenation is left associative). */
 117 typedef struct {
 118   tre_ast_node_t *left;
 119   tre_ast_node_t *right;
 120 } tre_catenation_t;
 121
 122 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 123    operators. */
 124 typedef struct {
 125   /* Subexpression to match. */
 126   tre_ast_node_t *arg;
 127   /* Minimum number of consecutive matches. */
 128   int min;
 129   /* Maximum number of consecutive matches. */
 130   int max;
 131   /* If 0, match as many characters as possible, if 1 match as few as
 132      possible.  Note that this does not always mean the same thing as
 133      matching as many/few repetitions as possible. */
 134   unsigned int minimal:1;
 135 } tre_iteration_t;
 136
 137 /* An "union" node.  These are created for the "|" operator. */
 138 typedef struct {
 139   tre_ast_node_t *left;
 140   tre_ast_node_t *right;
 141 } tre_union_t;
 142
 143 static tre_ast_node_t *
 144 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
 145
 146 static tre_ast_node_t *
 147 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
 148
 149 static tre_ast_node_t *
 150 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
 151                  int minimal);
 152
 153 static tre_ast_node_t *
 154 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
 155
 156 static tre_ast_node_t *
 157 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
 158                        tre_ast_node_t *right);
 159
 160
 161 static tre_ast_node_t *
 162 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
 163 {
 164   tre_ast_node_t *node;
 165
 166   node = tre_mem_calloc(mem, sizeof(*node));
 167   if (!node)
 168     return NULL;
 169   node->obj = tre_mem_calloc(mem, size);
 170   if (!node->obj)
 171     return NULL;
 172   node->type = type;
 173   node->nullable = -1;
 174   node->submatch_id = -1;
 175
 176   return node;
 177 }
 178
 179 static tre_ast_node_t *
 180 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 181 {
 182   tre_ast_node_t *node;
 183   tre_literal_t *lit;
 184
 185   node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
 186   if (!node)
 187     return NULL;
 188   lit = node->obj;
 189   lit->code_min = code_min;
 190   lit->code_max = code_max;
 191   lit->position = position;
 192
 193   return node;
 194 }
 195
 196 static tre_ast_node_t *
 197 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
 198                  int minimal)
 199 {
 200   tre_ast_node_t *node;
 201   tre_iteration_t *iter;
 202
 203   node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
 204   if (!node)
 205     return NULL;
 206   iter = node->obj;
 207   iter->arg = arg;
 208   iter->min = min;
 209   iter->max = max;
 210   iter->minimal = minimal;
 211   node->num_submatches = arg->num_submatches;
 212
 213   return node;
 214 }
 215
 216 static tre_ast_node_t *
 217 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 218 {
 219   tre_ast_node_t *node;
 220
 221   node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
 222   if (node == NULL)
 223     return NULL;
 224   ((tre_union_t *)node->obj)->left = left;
 225   ((tre_union_t *)node->obj)->right = right;
 226   node->num_submatches = left->num_submatches + right->num_submatches;
 227
 228   return node;
 229 }
 230
 231 static tre_ast_node_t *
 232 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
 233                        tre_ast_node_t *right)
 234 {
 235   tre_ast_node_t *node;
 236
 237   node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
 238   if (node == NULL)
 239     return NULL;
 240   ((tre_catenation_t *)node->obj)->left = left;
 241   ((tre_catenation_t *)node->obj)->right = right;
 242   node->num_submatches = left->num_submatches + right->num_submatches;
 243
 244   return node;
 245 }
 246
 247
 248 /***********************************************************************
 249  from tre-stack.c and tre-stack.h
 250 ***********************************************************************/
 251
 252 typedef struct tre_stack_rec tre_stack_t;
 253
 254 /* Creates a new stack object.  `size' is initial size in bytes, `max_size'
 255    is maximum size, and `increment' specifies how much more space will be
 256    allocated with realloc() if all space gets used up.  Returns the stack
 257    object or NULL if out of memory. */
 258 static tre_stack_t *
 259 tre_stack_new(int size, int max_size, int increment);
 260
 261 /* Frees the stack object. */
 262 static void
 263 tre_stack_destroy(tre_stack_t *s);
 264
 265 /* Returns the current number of objects in the stack. */
 266 static int
 267 tre_stack_num_objects(tre_stack_t *s);
 268
 269 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 270    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
 271    This tries to realloc() more space before failing if maximum size
 272    has not yet been reached.  Returns REG_OK if successful. */
 273 #define declare_pushf(typetag, type)                                          \
 274   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
 275
 276 declare_pushf(voidptr, void *);
 277 declare_pushf(int, int);
 278
 279 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 280    element off of stack `s' and returns it.  The stack must not be
 281    empty. */
 282 #define declare_popf(typetag, type)               \
 283   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
 284
 285 declare_popf(voidptr, void *);
 286 declare_popf(int, int);
 287
 288 /* Just to save some typing. */
 289 #define STACK_PUSH(s, typetag, value)                                         \
 290   do                                                                          \
 291     {                                                                         \
 292       status = tre_stack_push_ ## typetag(s, value);                          \
 293     }                                                                         \
 294   while (/*CONSTCOND*/0)
 295
 296 #define STACK_PUSHX(s, typetag, value)                                        \
 297   {                                                                           \
 298     status = tre_stack_push_ ## typetag(s, value);                            \
 299     if (status != REG_OK)                                                     \
 300       break;                                                                  \
 301   }
 302
 303 #define STACK_PUSHR(s, typetag, value)                                        \
 304   {                                                                           \
 305     reg_errcode_t _status;                                                    \
 306     _status = tre_stack_push_ ## typetag(s, value);                           \
 307     if (_status != REG_OK)                                                    \
 308       return _status;                                                         \
 309   }
 310
 311 union tre_stack_item {
 312   void *voidptr_value;
 313   int int_value;
 314 };
 315
 316 struct tre_stack_rec {
 317   int size;
 318   int max_size;
 319   int increment;
 320   int ptr;
 321   union tre_stack_item *stack;
 322 };
 323
 324
 325 static tre_stack_t *
 326 tre_stack_new(int size, int max_size, int increment)
 327 {
 328   tre_stack_t *s;
 329
 330   s = xmalloc(sizeof(*s));
 331   if (s != NULL)
 332     {
 333       s->stack = xmalloc(sizeof(*s->stack) * size);
 334       if (s->stack == NULL)
 335         {
 336           xfree(s);
 337           return NULL;
 338         }
 339       s->size = size;
 340       s->max_size = max_size;
 341       s->increment = increment;
 342       s->ptr = 0;
 343     }
 344   return s;
 345 }
 346
 347 static void
 348 tre_stack_destroy(tre_stack_t *s)
 349 {
 350   xfree(s->stack);
 351   xfree(s);
 352 }
 353
 354 static int
 355 tre_stack_num_objects(tre_stack_t *s)
 356 {
 357   return s->ptr;
 358 }
 359
 360 static reg_errcode_t
 361 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
 362 {
 363   if (s->ptr < s->size)
 364     {
 365       s->stack[s->ptr] = value;
 366       s->ptr++;
 367     }
 368   else
 369     {
 370       if (s->size >= s->max_size)
 371         {
 372           return REG_ESPACE;
 373         }
 374       else
 375         {
 376           union tre_stack_item *new_buffer;
 377           int new_size;
 378           new_size = s->size + s->increment;
 379           if (new_size > s->max_size)
 380             new_size = s->max_size;
 381           new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
 382           if (new_buffer == NULL)
 383             {
 384               return REG_ESPACE;
 385             }
 386           assert(new_size > s->size);
 387           s->size = new_size;
 388           s->stack = new_buffer;
 389           tre_stack_push(s, value);
 390         }
 391     }
 392   return REG_OK;
 393 }
 394
 395 #define define_pushf(typetag, type)  \
 396   declare_pushf(typetag, type) {     \
 397     union tre_stack_item item;       \
 398     item.typetag ## _value = value;  \
 399     return tre_stack_push(s, item);  \
 400 }
 401
 402 define_pushf(int, int)
 403 define_pushf(voidptr, void *)
 404
 405 #define define_popf(typetag, type)                  \
 406   declare_popf(typetag, type) {                     \
 407     return s->stack[--s->ptr].typetag ## _value;    \
 408   }
 409
 410 define_popf(int, int)
 411 define_popf(voidptr, void *)
 412
 413
 414 /***********************************************************************
 415  from tre-parse.c and tre-parse.h
 416 ***********************************************************************/
 417
 418 /* Parse context. */
 419 typedef struct {
 420   /* Memory allocator.  The AST is allocated using this. */
 421   tre_mem_t mem;
 422   /* Stack used for keeping track of regexp syntax. */
 423   tre_stack_t *stack;
 424   /* The parse result. */
 425   tre_ast_node_t *result;
 426   /* The regexp to parse and its length. */
 427   const char *re;
 428   /* The first character of the entire regexp. */
 429   const char *re_start;
 430   /* Current submatch ID. */
 431   int submatch_id;
 432   /* Current position (number of literal). */
 433   int position;
 434   /* The highest back reference or -1 if none seen so far. */
 435   int max_backref;
 436   /* This flag is set if the regexp uses approximate matching. */
 437   int have_approx;
 438   /* Compilation flags. */
 439   int cflags;
 440   /* If this flag is set the top-level submatch is not captured. */
 441   int nofirstsub;
 442 } tre_parse_ctx_t;
 443
 444 /* Parses a wide character regexp pattern into a syntax tree.  This parser
 445    handles both syntaxes (BRE and ERE), including the TRE extensions. */
 446 static reg_errcode_t
 447 tre_parse(tre_parse_ctx_t *ctx);
 448
 449
 450 /*
 451   This parser is just a simple recursive descent parser for POSIX.2
 452   regexps.  The parser supports both the obsolete default syntax and
 453   the "extended" syntax, and some nonstandard extensions.
 454 */
 455
 456 /* Characters with special meanings in regexp syntax. */
 457 #define CHAR_PIPE          '|'
 458 #define CHAR_LPAREN        '('
 459 #define CHAR_RPAREN        ')'
 460 #define CHAR_LBRACE        '{'
 461 #define CHAR_RBRACE        '}'
 462 #define CHAR_LBRACKET      '['
 463 #define CHAR_RBRACKET      ']'
 464 #define CHAR_MINUS         '-'
 465 #define CHAR_STAR          '*'
 466 #define CHAR_QUESTIONMARK  '?'
 467 #define CHAR_PLUS          '+'
 468 #define CHAR_PERIOD        '.'
 469 #define CHAR_COLON         ':'
 470 #define CHAR_EQUAL         '='
 471 #define CHAR_COMMA         ','
 472 #define CHAR_CARET         '^'
 473 #define CHAR_DOLLAR        '$'
 474 #define CHAR_BACKSLASH     '\\'
 475 #define CHAR_HASH          '#'
 476 #define CHAR_TILDE         '~'
 477
 478
 479 /* Some macros for expanding \w, \s, etc. */
 480 static const struct tre_macro_struct {
 481   const char c;
 482   const char *expansion;
 483 } tre_macros[] =
 484   { {'t', "\t"},           {'n', "\n"},            {'r', "\r"},
 485     {'f', "\f"},           {'a', "\a"},            {'e', "\033"},
 486     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 487     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
 488     { 0, NULL }
 489   };
 490
 491
 492 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 493    must have at least `len' items.  Sets buf[0] to zero if the there
 494    is no match in `tre_macros'. */
 495 static const char *
 496 tre_expand_macro(const char *regex)
 497 {
 498   int i;
 499
 500   if (!*regex)
 501     return 0;
 502
 503   for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
 504   return tre_macros[i].expansion;
 505 }
 506
 507 static reg_errcode_t
 508 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
 509          tre_ast_node_t ***items)
 510 {
 511   reg_errcode_t status;
 512   tre_ast_node_t **array = *items;
 513   /* Allocate more space if necessary. */
 514   if (*i >= *max_i)
 515     {
 516       tre_ast_node_t **new_items;
 517       /* If the array is already 1024 items large, give up -- there's
 518          probably an error in the regexp (e.g. not a '\0' terminated
 519          string and missing ']') */
 520       if (*max_i > 1024)
 521         return REG_ESPACE;
 522       *max_i *= 2;
 523       new_items = xrealloc(array, sizeof(*items) * *max_i);
 524       if (new_items == NULL)
 525         return REG_ESPACE;
 526       *items = array = new_items;
 527     }
 528   array[*i] = tre_ast_new_literal(mem, min, max, -1);
 529   status = array[*i] == NULL ? REG_ESPACE : REG_OK;
 530   (*i)++;
 531   return status;
 532 }
 533
 534
 535 static int
 536 tre_compare_items(const void *a, const void *b)
 537 {
 538   const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
 539   const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
 540   tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
 541   int a_min = l_a->code_min, b_min = l_b->code_min;
 542
 543   if (a_min < b_min)
 544     return -1;
 545   else if (a_min > b_min)
 546     return 1;
 547   else
 548     return 0;
 549 }
 550
 551 /* Maximum number of character classes that can occur in a negated bracket
 552    expression.  */
 553 #define MAX_NEG_CLASSES 64
 554
 555 /* Maximum length of character class names. */
 556 #define MAX_CLASS_NAME
 557
 558 static reg_errcode_t
 559 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
 560                         tre_ctype_t neg_classes[], int *num_neg_classes,
 561                         tre_ast_node_t ***items, int *num_items,
 562                         int *items_size)
 563 {
 564   const char *re = ctx->re;
 565   reg_errcode_t status = REG_OK;
 566   tre_ctype_t class = (tre_ctype_t)0;
 567   int i = *num_items;
 568   int max_i = *items_size;
 569   int skip;
 570
 571   /* Build an array of the items in the bracket expression. */
 572   while (status == REG_OK)
 573     {
 574       skip = 0;
 575       if (!*re)
 576         {
 577           status = REG_EBRACK;
 578         }
 579       else if (*re == CHAR_RBRACKET && re > ctx->re)
 580         {
 581           re++;
 582           break;
 583         }
 584       else
 585         {
 586           tre_cint_t min = 0, max = 0;
 587           wchar_t wc;
 588           int clen = mbtowc(&wc, re, -1);
 589
 590           if (clen<0) clen=1, wc=WEOF;
 591
 592           class = (tre_ctype_t)0;
 593           if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
 594             {
 595               min = wc;
 596               re += clen+1;
 597               clen = mbtowc(&wc, re, -1);
 598               if (clen<0) clen=1, wc=WEOF;
 599               max = wc;
 600               re += clen;
 601               /* XXX - Should use collation order instead of encoding values
 602                  in character ranges. */
 603               if (min > max)
 604                 status = REG_ERANGE;
 605             }
 606           else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
 607             status = REG_ECOLLATE;
 608           else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
 609             status = REG_ECOLLATE;
 610           else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
 611             {
 612               char tmp_str[64];
 613               const char *endptr = re + 2;
 614               int len;
 615               while (*endptr && *endptr != CHAR_COLON)
 616                 endptr++;
 617               if (*endptr)
 618                 {
 619                   len = MIN(endptr - re - 2, 63);
 620                   strncpy(tmp_str, re + 2, len);
 621                   tmp_str[len] = '\0';
 622                   class = tre_ctype(tmp_str);
 623                   if (!class)
 624                     status = REG_ECTYPE;
 625                   re = endptr + 2;
 626                 }
 627               else
 628                 status = REG_ECTYPE;
 629               min = 0;
 630               max = TRE_CHAR_MAX;
 631             }
 632           else
 633             {
 634               if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
 635                   && ctx->re != re)
 636                 /* Two ranges are not allowed to share and endpoint. */
 637                 status = REG_ERANGE;
 638               min = max = wc;
 639               re += clen;
 640             }
 641
 642           if (status != REG_OK)
 643             break;
 644
 645           if (class && negate)
 646             if (*num_neg_classes >= MAX_NEG_CLASSES)
 647               status = REG_ESPACE;
 648             else
 649               neg_classes[(*num_neg_classes)++] = class;
 650           else if (!skip)
 651             {
 652               status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
 653               if (status != REG_OK)
 654                 break;
 655               ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
 656             }
 657
 658           /* Add opposite-case counterpoints if REG_ICASE is present.
 659              This is broken if there are more than two "same" characters. */
 660           if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
 661             {
 662               tre_cint_t cmin, ccurr;
 663
 664               while (min <= max)
 665                 {
 666                   if (tre_islower(min))
 667                     {
 668                       cmin = ccurr = tre_toupper(min++);
 669                       while (tre_islower(min) && tre_toupper(min) == ccurr + 1
 670                              && min <= max)
 671                         ccurr = tre_toupper(min++);
 672                       status = tre_new_item(ctx->mem, cmin, ccurr,
 673                                             &i, &max_i, items);
 674                     }
 675                   else if (tre_isupper(min))
 676                     {
 677                       cmin = ccurr = tre_tolower(min++);
 678                       while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
 679                              && min <= max)
 680                         ccurr = tre_tolower(min++);
 681                       status = tre_new_item(ctx->mem, cmin, ccurr,
 682                                             &i, &max_i, items);
 683                     }
 684                   else min++;
 685                   if (status != REG_OK)
 686                     break;
 687                 }
 688               if (status != REG_OK)
 689                 break;
 690             }
 691         }
 692     }
 693   *num_items = i;
 694   *items_size = max_i;
 695   ctx->re = re;
 696   return status;
 697 }
 698
 699 static reg_errcode_t
 700 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 701 {
 702   tre_ast_node_t *node = NULL;
 703   int negate = 0;
 704   reg_errcode_t status = REG_OK;
 705   tre_ast_node_t **items, *u, *n;
 706   int i = 0, j, max_i = 32, curr_max, curr_min;
 707   tre_ctype_t neg_classes[MAX_NEG_CLASSES];
 708   int num_neg_classes = 0;
 709
 710   /* Start off with an array of `max_i' elements. */
 711   items = xmalloc(sizeof(*items) * max_i);
 712   if (items == NULL)
 713     return REG_ESPACE;
 714
 715   if (*ctx->re == CHAR_CARET)
 716     {
 717       negate = 1;
 718       ctx->re++;
 719     }
 720
 721   status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
 722                                    &items, &i, &max_i);
 723
 724   if (status != REG_OK)
 725     goto parse_bracket_done;
 726
 727   /* Sort the array if we need to negate it. */
 728   if (negate)
 729     qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
 730
 731   curr_max = curr_min = 0;
 732   /* Build a union of the items in the array, negated if necessary. */
 733   for (j = 0; j < i && status == REG_OK; j++)
 734     {
 735       int min, max;
 736       tre_literal_t *l = items[j]->obj;
 737       min = l->code_min;
 738       max = l->code_max;
 739
 740       if (negate)
 741         {
 742           if (min < curr_max)
 743             {
 744               /* Overlap. */
 745               curr_max = MAX(max + 1, curr_max);
 746               l = NULL;
 747             }
 748           else
 749             {
 750               /* No overlap. */
 751               curr_max = min - 1;
 752               if (curr_max >= curr_min)
 753                 {
 754                   l->code_min = curr_min;
 755                   l->code_max = curr_max;
 756                 }
 757               else
 758                 {
 759                   l = NULL;
 760                 }
 761               curr_min = curr_max = max + 1;
 762             }
 763         }
 764
 765       if (l != NULL)
 766         {
 767           int k;
 768           l->position = ctx->position;
 769           if (num_neg_classes > 0)
 770             {
 771               l->neg_classes = tre_mem_alloc(ctx->mem,
 772                                              (sizeof(l->neg_classes)
 773                                               * (num_neg_classes + 1)));
 774               if (l->neg_classes == NULL)
 775                 {
 776                   status = REG_ESPACE;
 777                   break;
 778                 }
 779               for (k = 0; k < num_neg_classes; k++)
 780                 l->neg_classes[k] = neg_classes[k];
 781               l->neg_classes[k] = (tre_ctype_t)0;
 782             }
 783           else
 784             l->neg_classes = NULL;
 785           if (node == NULL)
 786             node = items[j];
 787           else
 788             {
 789               u = tre_ast_new_union(ctx->mem, node, items[j]);
 790               if (u == NULL)
 791                 status = REG_ESPACE;
 792               node = u;
 793             }
 794         }
 795     }
 796
 797   if (status != REG_OK)
 798     goto parse_bracket_done;
 799
 800   if (negate)
 801     {
 802       int k;
 803       n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
 804       if (n == NULL)
 805         status = REG_ESPACE;
 806       else
 807         {
 808           tre_literal_t *l = n->obj;
 809           if (num_neg_classes > 0)
 810             {
 811               l->neg_classes = tre_mem_alloc(ctx->mem,
 812                                              (sizeof(l->neg_classes)
 813                                               * (num_neg_classes + 1)));
 814               if (l->neg_classes == NULL)
 815                 {
 816                   status = REG_ESPACE;
 817                   goto parse_bracket_done;
 818                 }
 819               for (k = 0; k < num_neg_classes; k++)
 820                 l->neg_classes[k] = neg_classes[k];
 821               l->neg_classes[k] = (tre_ctype_t)0;
 822             }
 823           else
 824             l->neg_classes = NULL;
 825           if (node == NULL)
 826             node = n;
 827           else
 828             {
 829               u = tre_ast_new_union(ctx->mem, node, n);
 830               if (u == NULL)
 831                 status = REG_ESPACE;
 832               node = u;
 833             }
 834         }
 835     }
 836
 837   if (status != REG_OK)
 838     goto parse_bracket_done;
 839
 840 #ifdef TRE_DEBUG
 841   tre_ast_print(node);
 842 #endif /* TRE_DEBUG */
 843
 844  parse_bracket_done:
 845   xfree(items);
 846   ctx->position++;
 847   *result = node;
 848   return status;
 849 }
 850
 851
 852 /* Parses a positive decimal integer.  Returns -1 if the string does not
 853    contain a valid number. */
 854 static int
 855 tre_parse_int(const char **regex)
 856 {
 857   int num = -1;
 858   const char *r = *regex;
 859   while (*r-'0'<10U)
 860     {
 861       if (num < 0)
 862         num = 0;
 863       num = num * 10 + *r - '0';
 864       r++;
 865     }
 866   *regex = r;
 867   return num;
 868 }
 869
 870
 871 static reg_errcode_t
 872 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 873 {
 874   int min, max;
 875   const char *r = ctx->re;
 876   int minimal = 0;
 877
 878   /* Parse number (minimum repetition count). */
 879   min = -1;
 880   if (*r >= '0' && *r <= '9') {
 881     min = tre_parse_int(&r);
 882   }
 883
 884   /* Parse comma and second number (maximum repetition count). */
 885   max = min;
 886   if (*r == CHAR_COMMA)
 887     {
 888       r++;
 889       max = tre_parse_int(&r);
 890     }
 891
 892   /* Check that the repeat counts are sane. */
 893   if ((max >= 0 && min > max) || max > RE_DUP_MAX)
 894     return REG_BADBR;
 895
 896   /* Missing }. */
 897   if (!*r)
 898     return REG_EBRACE;
 899
 900   /* Empty contents of {}. */
 901   if (r == ctx->re)
 902     return REG_BADBR;
 903
 904   /* Parse the ending '}' or '\}'.*/
 905   if (ctx->cflags & REG_EXTENDED)
 906     {
 907       if (*r != CHAR_RBRACE)
 908         return REG_BADBR;
 909       r++;
 910     }
 911   else
 912     {
 913       if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
 914         return REG_BADBR;
 915       r += 2;
 916     }
 917
 918   /* Create the AST node(s). */
 919   if (min == 0 && max == 0)
 920     {
 921       *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 922       if (*result == NULL)
 923         return REG_ESPACE;
 924     }
 925   else
 926     {
 927       if (min < 0 && max < 0)
 928         /* Only approximate parameters set, no repetitions. */
 929         min = max = 1;
 930
 931       *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
 932       if (!*result)
 933         return REG_ESPACE;
 934     }
 935
 936   ctx->re = r;
 937   return REG_OK;
 938 }
 939
 940 typedef enum {
 941   PARSE_RE = 0,
 942   PARSE_ATOM,
 943   PARSE_MARK_FOR_SUBMATCH,
 944   PARSE_BRANCH,
 945   PARSE_PIECE,
 946   PARSE_CATENATION,
 947   PARSE_POST_CATENATION,
 948   PARSE_UNION,
 949   PARSE_POST_UNION,
 950   PARSE_POSTFIX,
 951   PARSE_RESTORE_CFLAGS
 952 } tre_parse_re_stack_symbol_t;
 953
 954
 955 static reg_errcode_t
 956 tre_parse(tre_parse_ctx_t *ctx)
 957 {
 958   tre_ast_node_t *result = NULL;
 959   tre_parse_re_stack_symbol_t symbol;
 960   reg_errcode_t status = REG_OK;
 961   tre_stack_t *stack = ctx->stack;
 962   int bottom = tre_stack_num_objects(stack);
 963   int depth = 0;
 964
 965   if (!ctx->nofirstsub)
 966     {
 967       STACK_PUSH(stack, int, ctx->submatch_id);
 968       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
 969       ctx->submatch_id++;
 970     }
 971   STACK_PUSH(stack, int, PARSE_RE);
 972   ctx->re_start = ctx->re;
 973
 974
 975   /* The following is basically just a recursive descent parser.  I use
 976      an explicit stack instead of recursive functions mostly because of
 977      two reasons: compatibility with systems which have an overflowable
 978      call stack, and efficiency (both in lines of code and speed).  */
 979   while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
 980     {
 981       if (status != REG_OK)
 982         break;
 983       symbol = tre_stack_pop_int(stack);
 984       switch (symbol)
 985         {
 986         case PARSE_RE:
 987           /* Parse a full regexp.  A regexp is one or more branches,
 988              separated by the union operator `|'. */
 989           if (ctx->cflags & REG_EXTENDED)
 990             STACK_PUSHX(stack, int, PARSE_UNION);
 991           STACK_PUSHX(stack, int, PARSE_BRANCH);
 992           break;
 993
 994         case PARSE_BRANCH:
 995           /* Parse a branch.  A branch is one or more pieces, concatenated.
 996              A piece is an atom possibly followed by a postfix operator. */
 997           STACK_PUSHX(stack, int, PARSE_CATENATION);
 998           STACK_PUSHX(stack, int, PARSE_PIECE);
 999           break;
1000
1001         case PARSE_PIECE:
1002           /* Parse a piece.  A piece is an atom possibly followed by one
1003              or more postfix operators. */
1004             STACK_PUSHX(stack, int, PARSE_POSTFIX);
1005           STACK_PUSHX(stack, int, PARSE_ATOM);
1006           break;
1007
1008         case PARSE_CATENATION:
1009           /* If the expression has not ended, parse another piece. */
1010           {
1011             tre_char_t c;
1012             if (!*ctx->re)
1013               break;
1014             c = *ctx->re;
1015                 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1016                   break;
1017                 if ((ctx->cflags & REG_EXTENDED
1018                      && c == CHAR_RPAREN && depth > 0)
1019                     || (!(ctx->cflags & REG_EXTENDED)
1020                         && (c == CHAR_BACKSLASH
1021                             && *(ctx->re + 1) == CHAR_RPAREN)))
1022                   {
1023                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1024                       status = REG_EPAREN;
1025                     depth--;
1026                     if (!(ctx->cflags & REG_EXTENDED))
1027                       ctx->re += 2;
1028                     break;
1029                   }
1030
1031               {
1032                 /* Default case, left associative concatenation. */
1033                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1034                 STACK_PUSHX(stack, voidptr, result);
1035                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1036                 STACK_PUSHX(stack, int, PARSE_PIECE);
1037               }
1038             break;
1039           }
1040
1041         case PARSE_POST_CATENATION:
1042           {
1043             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1044             tre_ast_node_t *tmp_node;
1045             tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1046             if (!tmp_node)
1047               return REG_ESPACE;
1048             result = tmp_node;
1049             break;
1050           }
1051
1052         case PARSE_UNION:
1053           if (!*ctx->re)
1054             break;
1055           switch (*ctx->re)
1056             {
1057             case CHAR_PIPE:
1058               STACK_PUSHX(stack, int, PARSE_UNION);
1059               STACK_PUSHX(stack, voidptr, result);
1060               STACK_PUSHX(stack, int, PARSE_POST_UNION);
1061               STACK_PUSHX(stack, int, PARSE_BRANCH);
1062               ctx->re++;
1063               break;
1064
1065             case CHAR_RPAREN:
1066               ctx->re++;
1067               break;
1068
1069             default:
1070               break;
1071             }
1072           break;
1073
1074         case PARSE_POST_UNION:
1075           {
1076             tre_ast_node_t *tmp_node;
1077             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1078             tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1079             if (!tmp_node)
1080               return REG_ESPACE;
1081             result = tmp_node;
1082             break;
1083           }
1084
1085         case PARSE_POSTFIX:
1086           /* Parse postfix operators. */
1087           if (!*ctx->re)
1088             break;
1089           switch (*ctx->re)
1090             {
1091             case CHAR_PLUS:
1092             case CHAR_QUESTIONMARK:
1093               if (!(ctx->cflags & REG_EXTENDED))
1094                 break;
1095                 /*FALLTHROUGH*/
1096             case CHAR_STAR:
1097               {
1098                 tre_ast_node_t *tmp_node;
1099                 int minimal = 0;
1100                 int rep_min = 0;
1101                 int rep_max = -1;
1102
1103                 if (*ctx->re == CHAR_PLUS)
1104                   rep_min = 1;
1105                 if (*ctx->re == CHAR_QUESTIONMARK)
1106                   rep_max = 1;
1107
1108                 ctx->re++;
1109                 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1110                                             minimal);
1111                 if (tmp_node == NULL)
1112                   return REG_ESPACE;
1113                 result = tmp_node;
1114                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1115               }
1116               break;
1117
1118             case CHAR_BACKSLASH:
1119               /* "\{" is special without REG_EXTENDED */
1120               if (!(ctx->cflags & REG_EXTENDED)
1121                   && *(ctx->re + 1) == CHAR_LBRACE)
1122                 {
1123                   ctx->re++;
1124                   goto parse_brace;
1125                 }
1126               else
1127                 break;
1128
1129             case CHAR_LBRACE:
1130               /* "{" is literal without REG_EXTENDED */
1131               if (!(ctx->cflags & REG_EXTENDED))
1132                 break;
1133
1134             parse_brace:
1135               ctx->re++;
1136
1137               status = tre_parse_bound(ctx, &result);
1138               if (status != REG_OK)
1139                 return status;
1140               STACK_PUSHX(stack, int, PARSE_POSTFIX);
1141               break;
1142             }
1143           break;
1144
1145         case PARSE_ATOM:
1146           /* Parse an atom.  An atom is a regular expression enclosed in `()',
1147              an empty set of `()', a bracket expression, `.', `^', `$',
1148              a `\' followed by a character, or a single character. */
1149
1150           /* End of regexp? (empty string). */
1151           if (!*ctx->re)
1152             goto parse_literal;
1153
1154           switch (*ctx->re)
1155             {
1156             case CHAR_LPAREN:  /* parenthesized subexpression */
1157
1158               if (ctx->cflags & REG_EXTENDED
1159                   || (ctx->re > ctx->re_start
1160                       && *(ctx->re - 1) == CHAR_BACKSLASH))
1161                 {
1162                   depth++;
1163                     {
1164                       ctx->re++;
1165                       /* First parse a whole RE, then mark the resulting tree
1166                          for submatching. */
1167                       STACK_PUSHX(stack, int, ctx->submatch_id);
1168                       STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1169                       STACK_PUSHX(stack, int, PARSE_RE);
1170                       ctx->submatch_id++;
1171                     }
1172                 }
1173               else
1174                 goto parse_literal;
1175               break;
1176
1177             case CHAR_RPAREN:  /* end of current subexpression */
1178               if ((ctx->cflags & REG_EXTENDED && depth > 0)
1179                   || (ctx->re > ctx->re_start
1180                       && *(ctx->re - 1) == CHAR_BACKSLASH))
1181                 {
1182                   /* We were expecting an atom, but instead the current
1183                      subexpression was closed.  POSIX leaves the meaning of
1184                      this to be implementation-defined.  We interpret this as
1185                      an empty expression (which matches an empty string).  */
1186                   result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1187                   if (result == NULL)
1188                     return REG_ESPACE;
1189                   if (!(ctx->cflags & REG_EXTENDED))
1190                     ctx->re--;
1191                 }
1192               else
1193                 goto parse_literal;
1194               break;
1195
1196             case CHAR_LBRACKET: /* bracket expression */
1197               ctx->re++;
1198               status = tre_parse_bracket(ctx, &result);
1199               if (status != REG_OK)
1200                 return status;
1201               break;
1202
1203             case CHAR_BACKSLASH:
1204               /* If this is "\(" or "\)" chew off the backslash and
1205                  try again. */
1206               if (!(ctx->cflags & REG_EXTENDED)
1207                   && (*(ctx->re + 1) == CHAR_LPAREN
1208                       || *(ctx->re + 1) == CHAR_RPAREN))
1209                 {
1210                   ctx->re++;
1211                   STACK_PUSHX(stack, int, PARSE_ATOM);
1212                   break;
1213                 }
1214
1215               /* If a macro is used, parse the expanded macro recursively. */
1216               {
1217                 const char *buf = tre_expand_macro(ctx->re + 1);
1218                 if (buf)
1219                   {
1220                     tre_parse_ctx_t subctx;
1221                     memcpy(&subctx, ctx, sizeof(subctx));
1222                     subctx.re = buf;
1223                     subctx.nofirstsub = 1;
1224                     status = tre_parse(&subctx);
1225                     if (status != REG_OK)
1226                       return status;
1227                     ctx->re += 2;
1228                     ctx->position = subctx.position;
1229                     result = subctx.result;
1230                     break;
1231                   }
1232               }
1233
1234               if (!*ctx->re)
1235                 /* Trailing backslash. */
1236                 return REG_EESCAPE;
1237
1238               ctx->re++;
1239               switch (*ctx->re)
1240                 {
1241                 case 'b':
1242                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1243                                                ASSERT_AT_WB, -1);
1244                   ctx->re++;
1245                   break;
1246                 case 'B':
1247                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1248                                                ASSERT_AT_WB_NEG, -1);
1249                   ctx->re++;
1250                   break;
1251                 case '<':
1252                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1253                                                ASSERT_AT_BOW, -1);
1254                   ctx->re++;
1255                   break;
1256                 case '>':
1257                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1258                                                ASSERT_AT_EOW, -1);
1259                   ctx->re++;
1260                   break;
1261                 case 'x':
1262                   ctx->re++;
1263                   if (ctx->re[0] != CHAR_LBRACE)
1264                     {
1265                       /* 8 bit hex char. */
1266                       char tmp[3] = {0, 0, 0};
1267                       long val;
1268
1269                       if (tre_isxdigit(ctx->re[0]))
1270                         {
1271                           tmp[0] = (char)ctx->re[0];
1272                           ctx->re++;
1273                         }
1274                       if (tre_isxdigit(ctx->re[0]))
1275                         {
1276                           tmp[1] = (char)ctx->re[0];
1277                           ctx->re++;
1278                         }
1279                       val = strtol(tmp, NULL, 16);
1280                       result = tre_ast_new_literal(ctx->mem, (int)val,
1281                                                    (int)val, ctx->position);
1282                       ctx->position++;
1283                       break;
1284                     }
1285                   else if (*ctx->re)
1286                     {
1287                       /* Wide char. */
1288                       char tmp[32];
1289                       long val;
1290                       int i = 0;
1291                       ctx->re++;
1292                       while (*ctx->re && i < sizeof tmp)
1293                         {
1294                           if (ctx->re[0] == CHAR_RBRACE)
1295                             break;
1296                           if (tre_isxdigit(ctx->re[0]))
1297                             {
1298                               tmp[i] = (char)ctx->re[0];
1299                               i++;
1300                               ctx->re++;
1301                               continue;
1302                             }
1303                           return REG_EBRACE;
1304                         }
1305                       ctx->re++;
1306                       tmp[i] = 0;
1307                       val = strtol(tmp, NULL, 16);
1308                       result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1309                                                    ctx->position);
1310                       ctx->position++;
1311                       break;
1312                     }
1313                   /*FALLTHROUGH*/
1314
1315                 default:
1316                   if (tre_isdigit(*ctx->re))
1317                     {
1318                       /* Back reference. */
1319                       int val = *ctx->re - '0';
1320                       result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1321                                                    ctx->position);
1322                       if (result == NULL)
1323                         return REG_ESPACE;
1324                       ctx->position++;
1325                       ctx->max_backref = MAX(val, ctx->max_backref);
1326                       ctx->re++;
1327                     }
1328                   else
1329                     {
1330                       /* Escaped character. */
1331                       result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1332                                                    ctx->position);
1333                       ctx->position++;
1334                       ctx->re++;
1335                     }
1336                   break;
1337                 }
1338               if (result == NULL)
1339                 return REG_ESPACE;
1340               break;
1341
1342             case CHAR_PERIOD:    /* the any-symbol */
1343               if (ctx->cflags & REG_NEWLINE)
1344                 {
1345                   tre_ast_node_t *tmp1;
1346                   tre_ast_node_t *tmp2;
1347                   tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
1348                                              ctx->position);
1349                   if (!tmp1)
1350                     return REG_ESPACE;
1351                   tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
1352                                              ctx->position + 1);
1353                   if (!tmp2)
1354                     return REG_ESPACE;
1355                   result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1356                   if (!result)
1357                     return REG_ESPACE;
1358                   ctx->position += 2;
1359                 }
1360               else
1361                 {
1362                   result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1363                                                ctx->position);
1364                   if (!result)
1365                     return REG_ESPACE;
1366                   ctx->position++;
1367                 }
1368               ctx->re++;
1369               break;
1370
1371             case CHAR_CARET:     /* beginning of line assertion */
1372               /* '^' has a special meaning everywhere in EREs, and in the
1373                  beginning of the RE and after \( is BREs. */
1374               if (ctx->cflags & REG_EXTENDED
1375                   || (ctx->re - 2 >= ctx->re_start
1376                       && *(ctx->re - 2) == CHAR_BACKSLASH
1377                       && *(ctx->re - 1) == CHAR_LPAREN)
1378                   || ctx->re == ctx->re_start)
1379                 {
1380                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1381                                                ASSERT_AT_BOL, -1);
1382                   if (result == NULL)
1383                     return REG_ESPACE;
1384                   ctx->re++;
1385                 }
1386               else
1387                 goto parse_literal;
1388               break;
1389
1390             case CHAR_DOLLAR:    /* end of line assertion. */
1391               /* '$' is special everywhere in EREs, and in the end of the
1392                  string and before \) is BREs. */
1393               if (ctx->cflags & REG_EXTENDED
1394                   || (*(ctx->re + 1) == CHAR_BACKSLASH
1395                       && *(ctx->re + 2) == CHAR_RPAREN)
1396                   || !*(ctx->re + 1))
1397                 {
1398                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
1399                                                ASSERT_AT_EOL, -1);
1400                   if (result == NULL)
1401                     return REG_ESPACE;
1402                   ctx->re++;
1403                 }
1404               else
1405                 goto parse_literal;
1406               break;
1407
1408             default:
1409             parse_literal:
1410
1411               /* We are expecting an atom.  If the subexpression (or the whole
1412                  regexp ends here, we interpret it as an empty expression
1413                  (which matches an empty string).  */
1414               if (
1415                   (!*ctx->re
1416                    || *ctx->re == CHAR_STAR
1417                    || (ctx->cflags & REG_EXTENDED
1418                        && (*ctx->re == CHAR_PIPE
1419                            || *ctx->re == CHAR_LBRACE
1420                            || *ctx->re == CHAR_PLUS
1421                            || *ctx->re == CHAR_QUESTIONMARK))
1422                    /* Test for "\)" in BRE mode. */
1423                    || (!(ctx->cflags & REG_EXTENDED)
1424                        && !*(ctx->re + 1)
1425                        && *ctx->re == CHAR_BACKSLASH
1426                        && *(ctx->re + 1) == CHAR_LBRACE)))
1427                 {
1428                   result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1429                   if (!result)
1430                     return REG_ESPACE;
1431                   break;
1432                 }
1433
1434               wchar_t wc;
1435               int clen = mbtowc(&wc, ctx->re, -1);
1436               if (clen<0) clen=1, wc=WEOF;
1437
1438               /* Note that we can't use an tre_isalpha() test here, since there
1439                  may be characters which are alphabetic but neither upper or
1440                  lower case. */
1441               if (ctx->cflags & REG_ICASE
1442                   && (tre_isupper(wc) || tre_islower(wc)))
1443                 {
1444                   tre_ast_node_t *tmp1;
1445                   tre_ast_node_t *tmp2;
1446
1447                   /* XXX - Can there be more than one opposite-case
1448                      counterpoints for some character in some locale?  Or
1449                      more than two characters which all should be regarded
1450                      the same character if case is ignored?  If yes, there
1451                      does not seem to be a portable way to detect it.  I guess
1452                      that at least for multi-character collating elements there
1453                      could be several opposite-case counterpoints, but they
1454                      cannot be supported portably anyway. */
1455                   tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
1456                                              tre_toupper(wc),
1457                                              ctx->position);
1458                   if (!tmp1)
1459                     return REG_ESPACE;
1460                   tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
1461                                              tre_tolower(wc),
1462                                              ctx->position);
1463                   if (!tmp2)
1464                     return REG_ESPACE;
1465                   result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1466                   if (!result)
1467                     return REG_ESPACE;
1468                 }
1469               else
1470                 {
1471                   result = tre_ast_new_literal(ctx->mem, wc, wc,
1472                                                ctx->position);
1473                   if (!result)
1474                     return REG_ESPACE;
1475                 }
1476               ctx->position++;
1477               ctx->re += clen;
1478               break;
1479             }
1480           break;
1481
1482         case PARSE_MARK_FOR_SUBMATCH:
1483           {
1484             int submatch_id = tre_stack_pop_int(stack);
1485
1486             if (result->submatch_id >= 0)
1487               {
1488                 tre_ast_node_t *n, *tmp_node;
1489                 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1490                 if (n == NULL)
1491                   return REG_ESPACE;
1492                 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1493                 if (tmp_node == NULL)
1494                   return REG_ESPACE;
1495                 tmp_node->num_submatches = result->num_submatches;
1496                 result = tmp_node;
1497               }
1498             result->submatch_id = submatch_id;
1499             result->num_submatches++;
1500             break;
1501           }
1502
1503         case PARSE_RESTORE_CFLAGS:
1504           ctx->cflags = tre_stack_pop_int(stack);
1505           break;
1506
1507         default:
1508           assert(0);
1509           break;
1510         }
1511     }
1512
1513   /* Check for missing closing parentheses. */
1514   if (depth > 0)
1515     return REG_EPAREN;
1516
1517   if (status == REG_OK)
1518     ctx->result = result;
1519
1520   return status;
1521 }
1522
1523
1524
1525 /***********************************************************************
1526  from tre-compile.c
1527 ***********************************************************************/
1528
1529
1530 /*
1531   TODO:
1532    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1533      function calls.
1534 */
1535
1536 /*
1537   Algorithms to setup tags so that submatch addressing can be done.
1538 */
1539
1540
1541 /* Inserts a catenation node to the root of the tree given in `node'.
1542    As the left child a new tag with number `tag_id' to `node' is added,
1543    and the right child is the old root. */
1544 static reg_errcode_t
1545 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1546 {
1547   tre_catenation_t *c;
1548
1549   c = tre_mem_alloc(mem, sizeof(*c));
1550   if (c == NULL)
1551     return REG_ESPACE;
1552   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1553   if (c->left == NULL)
1554     return REG_ESPACE;
1555   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1556   if (c->right == NULL)
1557     return REG_ESPACE;
1558
1559   c->right->obj = node->obj;
1560   c->right->type = node->type;
1561   c->right->nullable = -1;
1562   c->right->submatch_id = -1;
1563   c->right->firstpos = NULL;
1564   c->right->lastpos = NULL;
1565   c->right->num_tags = 0;
1566   node->obj = c;
1567   node->type = CATENATION;
1568   return REG_OK;
1569 }
1570
1571 /* Inserts a catenation node to the root of the tree given in `node'.
1572    As the right child a new tag with number `tag_id' to `node' is added,
1573    and the left child is the old root. */
1574 static reg_errcode_t
1575 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1576 {
1577   tre_catenation_t *c;
1578
1579   c = tre_mem_alloc(mem, sizeof(*c));
1580   if (c == NULL)
1581     return REG_ESPACE;
1582   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1583   if (c->right == NULL)
1584     return REG_ESPACE;
1585   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1586   if (c->left == NULL)
1587     return REG_ESPACE;
1588
1589   c->left->obj = node->obj;
1590   c->left->type = node->type;
1591   c->left->nullable = -1;
1592   c->left->submatch_id = -1;
1593   c->left->firstpos = NULL;
1594   c->left->lastpos = NULL;
1595   c->left->num_tags = 0;
1596   node->obj = c;
1597   node->type = CATENATION;
1598   return REG_OK;
1599 }
1600
1601 typedef enum {
1602   ADDTAGS_RECURSE,
1603   ADDTAGS_AFTER_ITERATION,
1604   ADDTAGS_AFTER_UNION_LEFT,
1605   ADDTAGS_AFTER_UNION_RIGHT,
1606   ADDTAGS_AFTER_CAT_LEFT,
1607   ADDTAGS_AFTER_CAT_RIGHT,
1608   ADDTAGS_SET_SUBMATCH_END
1609 } tre_addtags_symbol_t;
1610
1611
1612 typedef struct {
1613   int tag;
1614   int next_tag;
1615 } tre_tag_states_t;
1616
1617
1618 /* Go through `regset' and set submatch data for submatches that are
1619    using this tag. */
1620 static void
1621 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1622 {
1623   int i;
1624
1625   for (i = 0; regset[i] >= 0; i++)
1626     {
1627       int id = regset[i] / 2;
1628       int start = !(regset[i] % 2);
1629       if (start)
1630         tnfa->submatch_data[id].so_tag = tag;
1631       else
1632         tnfa->submatch_data[id].eo_tag = tag;
1633     }
1634   regset[0] = -1;
1635 }
1636
1637
1638 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1639    subexpressions marked for submatch addressing can be traced. */
1640 static reg_errcode_t
1641 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1642              tre_tnfa_t *tnfa)
1643 {
1644   reg_errcode_t status = REG_OK;
1645   tre_addtags_symbol_t symbol;
1646   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1647   int bottom = tre_stack_num_objects(stack);
1648   /* True for first pass (counting number of needed tags) */
1649   int first_pass = (mem == NULL || tnfa == NULL);
1650   int *regset, *orig_regset;
1651   int num_tags = 0; /* Total number of tags. */
1652   int num_minimals = 0;  /* Number of special minimal tags. */
1653   int tag = 0;      /* The tag that is to be added next. */
1654   int next_tag = 1; /* Next tag to use after this one. */
1655   int *parents;     /* Stack of submatches the current submatch is
1656                        contained in. */
1657   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1658   tre_tag_states_t *saved_states;
1659
1660   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1661   if (!first_pass)
1662     {
1663       tnfa->end_tag = 0;
1664       tnfa->minimal_tags[0] = -1;
1665     }
1666
1667   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1668   if (regset == NULL)
1669     return REG_ESPACE;
1670   regset[0] = -1;
1671   orig_regset = regset;
1672
1673   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1674   if (parents == NULL)
1675     {
1676       xfree(regset);
1677       return REG_ESPACE;
1678     }
1679   parents[0] = -1;
1680
1681   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1682   if (saved_states == NULL)
1683     {
1684       xfree(regset);
1685       xfree(parents);
1686       return REG_ESPACE;
1687     }
1688   else
1689     {
1690       unsigned int i;
1691       for (i = 0; i <= tnfa->num_submatches; i++)
1692         saved_states[i].tag = -1;
1693     }
1694
1695   STACK_PUSH(stack, voidptr, node);
1696   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1697
1698   while (tre_stack_num_objects(stack) > bottom)
1699     {
1700       if (status != REG_OK)
1701         break;
1702
1703       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1704       switch (symbol)
1705         {
1706
1707         case ADDTAGS_SET_SUBMATCH_END:
1708           {
1709             int id = tre_stack_pop_int(stack);
1710             int i;
1711
1712             /* Add end of this submatch to regset. */
1713             for (i = 0; regset[i] >= 0; i++);
1714             regset[i] = id * 2 + 1;
1715             regset[i + 1] = -1;
1716
1717             /* Pop this submatch from the parents stack. */
1718             for (i = 0; parents[i] >= 0; i++);
1719             parents[i - 1] = -1;
1720             break;
1721           }
1722
1723         case ADDTAGS_RECURSE:
1724           node = tre_stack_pop_voidptr(stack);
1725
1726           if (node->submatch_id >= 0)
1727             {
1728               int id = node->submatch_id;
1729               int i;
1730
1731
1732               /* Add start of this submatch to regset. */
1733               for (i = 0; regset[i] >= 0; i++);
1734               regset[i] = id * 2;
1735               regset[i + 1] = -1;
1736
1737               if (!first_pass)
1738                 {
1739                   for (i = 0; parents[i] >= 0; i++);
1740                   tnfa->submatch_data[id].parents = NULL;
1741                   if (i > 0)
1742                     {
1743                       int *p = xmalloc(sizeof(*p) * (i + 1));
1744                       if (p == NULL)
1745                         {
1746                           status = REG_ESPACE;
1747                           break;
1748                         }
1749                       assert(tnfa->submatch_data[id].parents == NULL);
1750                       tnfa->submatch_data[id].parents = p;
1751                       for (i = 0; parents[i] >= 0; i++)
1752                         p[i] = parents[i];
1753                       p[i] = -1;
1754                     }
1755                 }
1756
1757               /* Add end of this submatch to regset after processing this
1758                  node. */
1759               STACK_PUSHX(stack, int, node->submatch_id);
1760               STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1761             }
1762
1763           switch (node->type)
1764             {
1765             case LITERAL:
1766               {
1767                 tre_literal_t *lit = node->obj;
1768
1769                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1770                   {
1771                     int i;
1772                     if (regset[0] >= 0)
1773                       {
1774                         /* Regset is not empty, so add a tag before the
1775                            literal or backref. */
1776                         if (!first_pass)
1777                           {
1778                             status = tre_add_tag_left(mem, node, tag);
1779                             tnfa->tag_directions[tag] = direction;
1780                             if (minimal_tag >= 0)
1781                               {
1782                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1783                                 tnfa->minimal_tags[i] = tag;
1784                                 tnfa->minimal_tags[i + 1] = minimal_tag;
1785                                 tnfa->minimal_tags[i + 2] = -1;
1786                                 minimal_tag = -1;
1787                                 num_minimals++;
1788                               }
1789                             tre_purge_regset(regset, tnfa, tag);
1790                           }
1791                         else
1792                           {
1793                             node->num_tags = 1;
1794                           }
1795
1796                         regset[0] = -1;
1797                         tag = next_tag;
1798                         num_tags++;
1799                         next_tag++;
1800                       }
1801                   }
1802                 else
1803                   {
1804                     assert(!IS_TAG(lit));
1805                   }
1806                 break;
1807               }
1808             case CATENATION:
1809               {
1810                 tre_catenation_t *cat = node->obj;
1811                 tre_ast_node_t *left = cat->left;
1812                 tre_ast_node_t *right = cat->right;
1813                 int reserved_tag = -1;
1814
1815
1816                 /* After processing right child. */
1817                 STACK_PUSHX(stack, voidptr, node);
1818                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1819
1820                 /* Process right child. */
1821                 STACK_PUSHX(stack, voidptr, right);
1822                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1823
1824                 /* After processing left child. */
1825                 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1826                 if (left->num_tags > 0 && right->num_tags > 0)
1827                   {
1828                     /* Reserve the next tag to the right child. */
1829                     reserved_tag = next_tag;
1830                     next_tag++;
1831                   }
1832                 STACK_PUSHX(stack, int, reserved_tag);
1833                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1834
1835                 /* Process left child. */
1836                 STACK_PUSHX(stack, voidptr, left);
1837                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1838
1839                 }
1840               break;
1841             case ITERATION:
1842               {
1843                 tre_iteration_t *iter = node->obj;
1844
1845                 if (first_pass)
1846                   {
1847                     STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1848                   }
1849                 else
1850                   {
1851                     STACK_PUSHX(stack, int, tag);
1852                     STACK_PUSHX(stack, int, iter->minimal);
1853                   }
1854                 STACK_PUSHX(stack, voidptr, node);
1855                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1856
1857                 STACK_PUSHX(stack, voidptr, iter->arg);
1858                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1859
1860                 /* Regset is not empty, so add a tag here. */
1861                 if (regset[0] >= 0 || iter->minimal)
1862                   {
1863                     if (!first_pass)
1864                       {
1865                         int i;
1866                         status = tre_add_tag_left(mem, node, tag);
1867                         if (iter->minimal)
1868                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1869                         else
1870                           tnfa->tag_directions[tag] = direction;
1871                         if (minimal_tag >= 0)
1872                           {
1873                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1874                             tnfa->minimal_tags[i] = tag;
1875                             tnfa->minimal_tags[i + 1] = minimal_tag;
1876                             tnfa->minimal_tags[i + 2] = -1;
1877                             minimal_tag = -1;
1878                             num_minimals++;
1879                           }
1880                         tre_purge_regset(regset, tnfa, tag);
1881                       }
1882
1883                     regset[0] = -1;
1884                     tag = next_tag;
1885                     num_tags++;
1886                     next_tag++;
1887                   }
1888                 direction = TRE_TAG_MINIMIZE;
1889               }
1890               break;
1891             case UNION:
1892               {
1893                 tre_union_t *uni = node->obj;
1894                 tre_ast_node_t *left = uni->left;
1895                 tre_ast_node_t *right = uni->right;
1896                 int left_tag;
1897                 int right_tag;
1898
1899                 if (regset[0] >= 0)
1900                   {
1901                     left_tag = next_tag;
1902                     right_tag = next_tag + 1;
1903                   }
1904                 else
1905                   {
1906                     left_tag = tag;
1907                     right_tag = next_tag;
1908                   }
1909
1910                 /* After processing right child. */
1911                 STACK_PUSHX(stack, int, right_tag);
1912                 STACK_PUSHX(stack, int, left_tag);
1913                 STACK_PUSHX(stack, voidptr, regset);
1914                 STACK_PUSHX(stack, int, regset[0] >= 0);
1915                 STACK_PUSHX(stack, voidptr, node);
1916                 STACK_PUSHX(stack, voidptr, right);
1917                 STACK_PUSHX(stack, voidptr, left);
1918                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1919
1920                 /* Process right child. */
1921                 STACK_PUSHX(stack, voidptr, right);
1922                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1923
1924                 /* After processing left child. */
1925                 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1926
1927                 /* Process left child. */
1928                 STACK_PUSHX(stack, voidptr, left);
1929                 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1930
1931                 /* Regset is not empty, so add a tag here. */
1932                 if (regset[0] >= 0)
1933                   {
1934                     if (!first_pass)
1935                       {
1936                         int i;
1937                         status = tre_add_tag_left(mem, node, tag);
1938                         tnfa->tag_directions[tag] = direction;
1939                         if (minimal_tag >= 0)
1940                           {
1941                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1942                             tnfa->minimal_tags[i] = tag;
1943                             tnfa->minimal_tags[i + 1] = minimal_tag;
1944                             tnfa->minimal_tags[i + 2] = -1;
1945                             minimal_tag = -1;
1946                             num_minimals++;
1947                           }
1948                         tre_purge_regset(regset, tnfa, tag);
1949                       }
1950
1951                     regset[0] = -1;
1952                     tag = next_tag;
1953                     num_tags++;
1954                     next_tag++;
1955                   }
1956
1957                 if (node->num_submatches > 0)
1958                   {
1959                     /* The next two tags are reserved for markers. */
1960                     next_tag++;
1961                     tag = next_tag;
1962                     next_tag++;
1963                   }
1964
1965                 break;
1966               }
1967             }
1968
1969           if (node->submatch_id >= 0)
1970             {
1971               int i;
1972               /* Push this submatch on the parents stack. */
1973               for (i = 0; parents[i] >= 0; i++);
1974               parents[i] = node->submatch_id;
1975               parents[i + 1] = -1;
1976             }
1977
1978           break; /* end case: ADDTAGS_RECURSE */
1979
1980         case ADDTAGS_AFTER_ITERATION:
1981           {
1982             int minimal = 0;
1983             int enter_tag;
1984             node = tre_stack_pop_voidptr(stack);
1985             if (first_pass)
1986               {
1987                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1988                   + tre_stack_pop_int(stack);
1989                 minimal_tag = -1;
1990               }
1991             else
1992               {
1993                 minimal = tre_stack_pop_int(stack);
1994                 enter_tag = tre_stack_pop_int(stack);
1995                 if (minimal)
1996                   minimal_tag = enter_tag;
1997               }
1998
1999             if (!first_pass)
2000               {
2001                 if (minimal)
2002                   direction = TRE_TAG_MINIMIZE;
2003                 else
2004                   direction = TRE_TAG_MAXIMIZE;
2005               }
2006             break;
2007           }
2008
2009         case ADDTAGS_AFTER_CAT_LEFT:
2010           {
2011             int new_tag = tre_stack_pop_int(stack);
2012             next_tag = tre_stack_pop_int(stack);
2013             if (new_tag >= 0)
2014               {
2015                 tag = new_tag;
2016               }
2017             break;
2018           }
2019
2020         case ADDTAGS_AFTER_CAT_RIGHT:
2021           node = tre_stack_pop_voidptr(stack);
2022           if (first_pass)
2023             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
2024               + ((tre_catenation_t *)node->obj)->right->num_tags;
2025           break;
2026
2027         case ADDTAGS_AFTER_UNION_LEFT:
2028           /* Lift the bottom of the `regset' array so that when processing
2029              the right operand the items currently in the array are
2030              invisible.  The original bottom was saved at ADDTAGS_UNION and
2031              will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
2032           while (*regset >= 0)
2033             regset++;
2034           break;
2035
2036         case ADDTAGS_AFTER_UNION_RIGHT:
2037           {
2038             int added_tags, tag_left, tag_right;
2039             tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
2040             tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
2041             node = tre_stack_pop_voidptr(stack);
2042             added_tags = tre_stack_pop_int(stack);
2043             if (first_pass)
2044               {
2045                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
2046                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
2047                   + ((node->num_submatches > 0) ? 2 : 0);
2048               }
2049             regset = tre_stack_pop_voidptr(stack);
2050             tag_left = tre_stack_pop_int(stack);
2051             tag_right = tre_stack_pop_int(stack);
2052
2053             /* Add tags after both children, the left child gets a smaller
2054                tag than the right child.  This guarantees that we prefer
2055                the left child over the right child. */
2056             /* XXX - This is not always necessary (if the children have
2057                tags which must be seen for every match of that child). */
2058             /* XXX - Check if this is the only place where tre_add_tag_right
2059                is used.  If so, use tre_add_tag_left (putting the tag before
2060                the child as opposed after the child) and throw away
2061                tre_add_tag_right. */
2062             if (node->num_submatches > 0)
2063               {
2064                 if (!first_pass)
2065                   {
2066                     status = tre_add_tag_right(mem, left, tag_left);
2067                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
2068                     status = tre_add_tag_right(mem, right, tag_right);
2069                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
2070                   }
2071                 num_tags += 2;
2072               }
2073             direction = TRE_TAG_MAXIMIZE;
2074             break;
2075           }
2076
2077         default:
2078           assert(0);
2079           break;
2080
2081         } /* end switch(symbol) */
2082     } /* end while(tre_stack_num_objects(stack) > bottom) */
2083
2084   if (!first_pass)
2085     tre_purge_regset(regset, tnfa, tag);
2086
2087   if (!first_pass && minimal_tag >= 0)
2088     {
2089       int i;
2090       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
2091       tnfa->minimal_tags[i] = tag;
2092       tnfa->minimal_tags[i + 1] = minimal_tag;
2093       tnfa->minimal_tags[i + 2] = -1;
2094       minimal_tag = -1;
2095       num_minimals++;
2096     }
2097
2098   assert(tree->num_tags == num_tags);
2099   tnfa->end_tag = num_tags;
2100   tnfa->num_tags = num_tags;
2101   tnfa->num_minimals = num_minimals;
2102   xfree(orig_regset);
2103   xfree(parents);
2104   xfree(saved_states);
2105   return status;
2106 }
2107
2108
2109
2110 /*
2111   AST to TNFA compilation routines.
2112 */
2113
2114 typedef enum {
2115   COPY_RECURSE,
2116   COPY_SET_RESULT_PTR
2117 } tre_copyast_symbol_t;
2118
2119 /* Flags for tre_copy_ast(). */
2120 #define COPY_REMOVE_TAGS         1
2121 #define COPY_MAXIMIZE_FIRST_TAG  2
2122
2123 static reg_errcode_t
2124 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2125              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
2126              tre_ast_node_t **copy, int *max_pos)
2127 {
2128   reg_errcode_t status = REG_OK;
2129   int bottom = tre_stack_num_objects(stack);
2130   int num_copied = 0;
2131   int first_tag = 1;
2132   tre_ast_node_t **result = copy;
2133   tre_copyast_symbol_t symbol;
2134
2135   STACK_PUSH(stack, voidptr, ast);
2136   STACK_PUSH(stack, int, COPY_RECURSE);
2137
2138   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2139     {
2140       tre_ast_node_t *node;
2141       if (status != REG_OK)
2142         break;
2143
2144       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
2145       switch (symbol)
2146         {
2147         case COPY_SET_RESULT_PTR:
2148           result = tre_stack_pop_voidptr(stack);
2149           break;
2150         case COPY_RECURSE:
2151           node = tre_stack_pop_voidptr(stack);
2152           switch (node->type)
2153             {
2154             case LITERAL:
2155               {
2156                 tre_literal_t *lit = node->obj;
2157                 int pos = lit->position;
2158                 int min = lit->code_min;
2159                 int max = lit->code_max;
2160                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2161                   {
2162                     /* XXX - e.g. [ab] has only one position but two
2163                        nodes, so we are creating holes in the state space
2164                        here.  Not fatal, just wastes memory. */
2165                     pos += *pos_add;
2166                     num_copied++;
2167                   }
2168                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
2169                   {
2170                     /* Change this tag to empty. */
2171                     min = EMPTY;
2172                     max = pos = -1;
2173                   }
2174                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
2175                          && first_tag)
2176                   {
2177                     /* Maximize the first tag. */
2178                     tag_directions[max] = TRE_TAG_MAXIMIZE;
2179                     first_tag = 0;
2180                   }
2181                 *result = tre_ast_new_literal(mem, min, max, pos);
2182                 if (*result == NULL)
2183                   status = REG_ESPACE;
2184
2185                 if (pos > *max_pos)
2186                   *max_pos = pos;
2187                 break;
2188               }
2189             case UNION:
2190               {
2191                 tre_union_t *uni = node->obj;
2192                 tre_union_t *tmp;
2193                 *result = tre_ast_new_union(mem, uni->left, uni->right);
2194                 if (*result == NULL)
2195                   {
2196                     status = REG_ESPACE;
2197                     break;
2198                   }
2199                 tmp = (*result)->obj;
2200                 result = &tmp->left;
2201                 STACK_PUSHX(stack, voidptr, uni->right);
2202                 STACK_PUSHX(stack, int, COPY_RECURSE);
2203                 STACK_PUSHX(stack, voidptr, &tmp->right);
2204                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2205                 STACK_PUSHX(stack, voidptr, uni->left);
2206                 STACK_PUSHX(stack, int, COPY_RECURSE);
2207                 break;
2208               }
2209             case CATENATION:
2210               {
2211                 tre_catenation_t *cat = node->obj;
2212                 tre_catenation_t *tmp;
2213                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
2214                 if (*result == NULL)
2215                   {
2216                     status = REG_ESPACE;
2217                     break;
2218                   }
2219                 tmp = (*result)->obj;
2220                 tmp->left = NULL;
2221                 tmp->right = NULL;
2222                 result = &tmp->left;
2223
2224                 STACK_PUSHX(stack, voidptr, cat->right);
2225                 STACK_PUSHX(stack, int, COPY_RECURSE);
2226                 STACK_PUSHX(stack, voidptr, &tmp->right);
2227                 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
2228                 STACK_PUSHX(stack, voidptr, cat->left);
2229                 STACK_PUSHX(stack, int, COPY_RECURSE);
2230                 break;
2231               }
2232             case ITERATION:
2233               {
2234                 tre_iteration_t *iter = node->obj;
2235                 STACK_PUSHX(stack, voidptr, iter->arg);
2236                 STACK_PUSHX(stack, int, COPY_RECURSE);
2237                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
2238                                            iter->max, iter->minimal);
2239                 if (*result == NULL)
2240                   {
2241                     status = REG_ESPACE;
2242                     break;
2243                   }
2244                 iter = (*result)->obj;
2245                 result = &iter->arg;
2246                 break;
2247               }
2248             default:
2249               assert(0);
2250               break;
2251             }
2252           break;
2253         }
2254     }
2255   *pos_add += num_copied;
2256   return status;
2257 }
2258
2259 typedef enum {
2260   EXPAND_RECURSE,
2261   EXPAND_AFTER_ITER
2262 } tre_expand_ast_symbol_t;
2263
2264 /* Expands each iteration node that has a finite nonzero minimum or maximum
2265    iteration count to a catenated sequence of copies of the node. */
2266 static reg_errcode_t
2267 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
2268                int *position, tre_tag_direction_t *tag_directions,
2269                int *max_depth)
2270 {
2271   reg_errcode_t status = REG_OK;
2272   int bottom = tre_stack_num_objects(stack);
2273   int pos_add = 0;
2274   int pos_add_total = 0;
2275   int max_pos = 0;
2276   int iter_depth = 0;
2277
2278   STACK_PUSHR(stack, voidptr, ast);
2279   STACK_PUSHR(stack, int, EXPAND_RECURSE);
2280   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2281     {
2282       tre_ast_node_t *node;
2283       tre_expand_ast_symbol_t symbol;
2284
2285       if (status != REG_OK)
2286         break;
2287
2288       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
2289       node = tre_stack_pop_voidptr(stack);
2290       switch (symbol)
2291         {
2292         case EXPAND_RECURSE:
2293           switch (node->type)
2294             {
2295             case LITERAL:
2296               {
2297                 tre_literal_t *lit= node->obj;
2298                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
2299                   {
2300                     lit->position += pos_add;
2301                     if (lit->position > max_pos)
2302                       max_pos = lit->position;
2303                   }
2304                 break;
2305               }
2306             case UNION:
2307               {
2308                 tre_union_t *uni = node->obj;
2309                 STACK_PUSHX(stack, voidptr, uni->right);
2310                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2311                 STACK_PUSHX(stack, voidptr, uni->left);
2312                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2313                 break;
2314               }
2315             case CATENATION:
2316               {
2317                 tre_catenation_t *cat = node->obj;
2318                 STACK_PUSHX(stack, voidptr, cat->right);
2319                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2320                 STACK_PUSHX(stack, voidptr, cat->left);
2321                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2322                 break;
2323               }
2324             case ITERATION:
2325               {
2326                 tre_iteration_t *iter = node->obj;
2327                 STACK_PUSHX(stack, int, pos_add);
2328                 STACK_PUSHX(stack, voidptr, node);
2329                 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
2330                 STACK_PUSHX(stack, voidptr, iter->arg);
2331                 STACK_PUSHX(stack, int, EXPAND_RECURSE);
2332                 /* If we are going to expand this node at EXPAND_AFTER_ITER
2333                    then don't increase the `pos' fields of the nodes now, it
2334                    will get done when expanding. */
2335                 if (iter->min > 1 || iter->max > 1)
2336                   pos_add = 0;
2337                 iter_depth++;
2338                 break;
2339               }
2340             default:
2341               assert(0);
2342               break;
2343             }
2344           break;
2345         case EXPAND_AFTER_ITER:
2346           {
2347             tre_iteration_t *iter = node->obj;
2348             int pos_add_last;
2349             pos_add = tre_stack_pop_int(stack);
2350             pos_add_last = pos_add;
2351             if (iter->min > 1 || iter->max > 1)
2352               {
2353                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
2354                 int j;
2355                 int pos_add_save = pos_add;
2356
2357                 /* Create a catenated sequence of copies of the node. */
2358                 for (j = 0; j < iter->min; j++)
2359                   {
2360                     tre_ast_node_t *copy;
2361                     /* Remove tags from all but the last copy. */
2362                     int flags = ((j + 1 < iter->min)
2363                                  ? COPY_REMOVE_TAGS
2364                                  : COPY_MAXIMIZE_FIRST_TAG);
2365                     pos_add_save = pos_add;
2366                     status = tre_copy_ast(mem, stack, iter->arg, flags,
2367                                           &pos_add, tag_directions, &copy,
2368                                           &max_pos);
2369                     if (status != REG_OK)
2370                       return status;
2371                     if (seq1 != NULL)
2372                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
2373                     else
2374                       seq1 = copy;
2375                     if (seq1 == NULL)
2376                       return REG_ESPACE;
2377                   }
2378
2379                 if (iter->max == -1)
2380                   {
2381                     /* No upper limit. */
2382                     pos_add_save = pos_add;
2383                     status = tre_copy_ast(mem, stack, iter->arg, 0,
2384                                           &pos_add, NULL, &seq2, &max_pos);
2385                     if (status != REG_OK)
2386                       return status;
2387                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
2388                     if (seq2 == NULL)
2389                       return REG_ESPACE;
2390                   }
2391                 else
2392                   {
2393                     for (j = iter->min; j < iter->max; j++)
2394                       {
2395                         tre_ast_node_t *tmp, *copy;
2396                         pos_add_save = pos_add;
2397                         status = tre_copy_ast(mem, stack, iter->arg, 0,
2398                                               &pos_add, NULL, &copy, &max_pos);
2399                         if (status != REG_OK)
2400                           return status;
2401                         if (seq2 != NULL)
2402                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
2403                         else
2404                           seq2 = copy;
2405                         if (seq2 == NULL)
2406                           return REG_ESPACE;
2407                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
2408                         if (tmp == NULL)
2409                           return REG_ESPACE;
2410                         seq2 = tre_ast_new_union(mem, tmp, seq2);
2411                         if (seq2 == NULL)
2412                           return REG_ESPACE;
2413                       }
2414                   }
2415
2416                 pos_add = pos_add_save;
2417                 if (seq1 == NULL)
2418                   seq1 = seq2;
2419                 else if (seq2 != NULL)
2420                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
2421                 if (seq1 == NULL)
2422                   return REG_ESPACE;
2423                 node->obj = seq1->obj;
2424                 node->type = seq1->type;
2425               }
2426
2427             iter_depth--;
2428             pos_add_total += pos_add - pos_add_last;
2429             if (iter_depth == 0)
2430               pos_add = pos_add_total;
2431
2432             break;
2433           }
2434         default:
2435           assert(0);
2436           break;
2437         }
2438     }
2439
2440   *position += pos_add_total;
2441
2442   /* `max_pos' should never be larger than `*position' if the above
2443      code works, but just an extra safeguard let's make sure
2444      `*position' is set large enough so enough memory will be
2445      allocated for the transition table. */
2446   if (max_pos > *position)
2447     *position = max_pos;
2448
2449   return status;
2450 }
2451
2452 static tre_pos_and_tags_t *
2453 tre_set_empty(tre_mem_t mem)
2454 {
2455   tre_pos_and_tags_t *new_set;
2456
2457   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2458   if (new_set == NULL)
2459     return NULL;
2460
2461   new_set[0].position = -1;
2462   new_set[0].code_min = -1;
2463   new_set[0].code_max = -1;
2464
2465   return new_set;
2466 }
2467
2468 static tre_pos_and_tags_t *
2469 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2470             tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2471 {
2472   tre_pos_and_tags_t *new_set;
2473
2474   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2475   if (new_set == NULL)
2476     return NULL;
2477
2478   new_set[0].position = position;
2479   new_set[0].code_min = code_min;
2480   new_set[0].code_max = code_max;
2481   new_set[0].class = class;
2482   new_set[0].neg_classes = neg_classes;
2483   new_set[0].backref = backref;
2484   new_set[1].position = -1;
2485   new_set[1].code_min = -1;
2486   new_set[1].code_max = -1;
2487
2488   return new_set;
2489 }
2490
2491 static tre_pos_and_tags_t *
2492 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2493               int *tags, int assertions)
2494 {
2495   int s1, s2, i, j;
2496   tre_pos_and_tags_t *new_set;
2497   int *new_tags;
2498   int num_tags;
2499
2500   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2501   for (s1 = 0; set1[s1].position >= 0; s1++);
2502   for (s2 = 0; set2[s2].position >= 0; s2++);
2503   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2504   if (!new_set )
2505     return NULL;
2506
2507   for (s1 = 0; set1[s1].position >= 0; s1++)
2508     {
2509       new_set[s1].position = set1[s1].position;
2510       new_set[s1].code_min = set1[s1].code_min;
2511       new_set[s1].code_max = set1[s1].code_max;
2512       new_set[s1].assertions = set1[s1].assertions | assertions;
2513       new_set[s1].class = set1[s1].class;
2514       new_set[s1].neg_classes = set1[s1].neg_classes;
2515       new_set[s1].backref = set1[s1].backref;
2516       if (set1[s1].tags == NULL && tags == NULL)
2517         new_set[s1].tags = NULL;
2518       else
2519         {
2520           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2521           new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2522                                          * (i + num_tags + 1)));
2523           if (new_tags == NULL)
2524             return NULL;
2525           for (j = 0; j < i; j++)
2526             new_tags[j] = set1[s1].tags[j];
2527           for (i = 0; i < num_tags; i++)
2528             new_tags[j + i] = tags[i];
2529           new_tags[j + i] = -1;
2530           new_set[s1].tags = new_tags;
2531         }
2532     }
2533
2534   for (s2 = 0; set2[s2].position >= 0; s2++)
2535     {
2536       new_set[s1 + s2].position = set2[s2].position;
2537       new_set[s1 + s2].code_min = set2[s2].code_min;
2538       new_set[s1 + s2].code_max = set2[s2].code_max;
2539       /* XXX - why not | assertions here as well? */
2540       new_set[s1 + s2].assertions = set2[s2].assertions;
2541       new_set[s1 + s2].class = set2[s2].class;
2542       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2543       new_set[s1 + s2].backref = set2[s2].backref;
2544       if (set2[s2].tags == NULL)
2545         new_set[s1 + s2].tags = NULL;
2546       else
2547         {
2548           for (i = 0; set2[s2].tags[i] >= 0; i++);
2549           new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2550           if (new_tags == NULL)
2551             return NULL;
2552           for (j = 0; j < i; j++)
2553             new_tags[j] = set2[s2].tags[j];
2554           new_tags[j] = -1;
2555           new_set[s1 + s2].tags = new_tags;
2556         }
2557     }
2558   new_set[s1 + s2].position = -1;
2559   return new_set;
2560 }
2561
2562 /* Finds the empty path through `node' which is the one that should be
2563    taken according to POSIX.2 rules, and adds the tags on that path to
2564    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2565    set to the number of tags seen on the path. */
2566 static reg_errcode_t
2567 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2568                 int *assertions, int *params, int *num_tags_seen,
2569                 int *params_seen)
2570 {
2571   tre_literal_t *lit;
2572   tre_union_t *uni;
2573   tre_catenation_t *cat;
2574   tre_iteration_t *iter;
2575   int i;
2576   int bottom = tre_stack_num_objects(stack);
2577   reg_errcode_t status = REG_OK;
2578   if (num_tags_seen)
2579     *num_tags_seen = 0;
2580
2581   status = tre_stack_push_voidptr(stack, node);
2582
2583   /* Walk through the tree recursively. */
2584   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2585     {
2586       node = tre_stack_pop_voidptr(stack);
2587
2588       switch (node->type)
2589         {
2590         case LITERAL:
2591           lit = (tre_literal_t *)node->obj;
2592           switch (lit->code_min)
2593             {
2594             case TAG:
2595               if (lit->code_max >= 0)
2596                 {
2597                   if (tags != NULL)
2598                     {
2599                       /* Add the tag to `tags'. */
2600                       for (i = 0; tags[i] >= 0; i++)
2601                         if (tags[i] == lit->code_max)
2602                           break;
2603                       if (tags[i] < 0)
2604                         {
2605                           tags[i] = lit->code_max;
2606                           tags[i + 1] = -1;
2607                         }
2608                     }
2609                   if (num_tags_seen)
2610                     (*num_tags_seen)++;
2611                 }
2612               break;
2613             case ASSERTION:
2614               assert(lit->code_max >= 1
2615                      || lit->code_max <= ASSERT_LAST);
2616               if (assertions != NULL)
2617                 *assertions |= lit->code_max;
2618               break;
2619             case EMPTY:
2620               break;
2621             default:
2622               assert(0);
2623               break;
2624             }
2625           break;
2626
2627         case UNION:
2628           /* Subexpressions starting earlier take priority over ones
2629              starting later, so we prefer the left subexpression over the
2630              right subexpression. */
2631           uni = (tre_union_t *)node->obj;
2632           if (uni->left->nullable)
2633             STACK_PUSHX(stack, voidptr, uni->left)
2634           else if (uni->right->nullable)
2635             STACK_PUSHX(stack, voidptr, uni->right)
2636           else
2637             assert(0);
2638           break;
2639
2640         case CATENATION:
2641           /* The path must go through both children. */
2642           cat = (tre_catenation_t *)node->obj;
2643           assert(cat->left->nullable);
2644           assert(cat->right->nullable);
2645           STACK_PUSHX(stack, voidptr, cat->left);
2646           STACK_PUSHX(stack, voidptr, cat->right);
2647           break;
2648
2649         case ITERATION:
2650           /* A match with an empty string is preferred over no match at
2651              all, so we go through the argument if possible. */
2652           iter = (tre_iteration_t *)node->obj;
2653           if (iter->arg->nullable)
2654             STACK_PUSHX(stack, voidptr, iter->arg);
2655           break;
2656
2657         default:
2658           assert(0);
2659           break;
2660         }
2661     }
2662
2663   return status;
2664 }
2665
2666
2667 typedef enum {
2668   NFL_RECURSE,
2669   NFL_POST_UNION,
2670   NFL_POST_CATENATION,
2671   NFL_POST_ITERATION
2672 } tre_nfl_stack_symbol_t;
2673
2674
2675 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2676    the nodes of the AST `tree'. */
2677 static reg_errcode_t
2678 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2679 {
2680   int bottom = tre_stack_num_objects(stack);
2681
2682   STACK_PUSHR(stack, voidptr, tree);
2683   STACK_PUSHR(stack, int, NFL_RECURSE);
2684
2685   while (tre_stack_num_objects(stack) > bottom)
2686     {
2687       tre_nfl_stack_symbol_t symbol;
2688       tre_ast_node_t *node;
2689
2690       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2691       node = tre_stack_pop_voidptr(stack);
2692       switch (symbol)
2693         {
2694         case NFL_RECURSE:
2695           switch (node->type)
2696             {
2697             case LITERAL:
2698               {
2699                 tre_literal_t *lit = (tre_literal_t *)node->obj;
2700                 if (IS_BACKREF(lit))
2701                   {
2702                     /* Back references: nullable = false, firstpos = {i},
2703                        lastpos = {i}. */
2704                     node->nullable = 0;
2705                     node->firstpos = tre_set_one(mem, lit->position, 0,
2706                                              TRE_CHAR_MAX, 0, NULL, -1);
2707                     if (!node->firstpos)
2708                       return REG_ESPACE;
2709                     node->lastpos = tre_set_one(mem, lit->position, 0,
2710                                                 TRE_CHAR_MAX, 0, NULL,
2711                                                 (int)lit->code_max);
2712                     if (!node->lastpos)
2713                       return REG_ESPACE;
2714                   }
2715                 else if (lit->code_min < 0)
2716                   {
2717                     /* Tags, empty strings, params, and zero width assertions:
2718                        nullable = true, firstpos = {}, and lastpos = {}. */
2719                     node->nullable = 1;
2720                     node->firstpos = tre_set_empty(mem);
2721                     if (!node->firstpos)
2722                       return REG_ESPACE;
2723                     node->lastpos = tre_set_empty(mem);
2724                     if (!node->lastpos)
2725                       return REG_ESPACE;
2726                   }
2727                 else
2728                   {
2729                     /* Literal at position i: nullable = false, firstpos = {i},
2730                        lastpos = {i}. */
2731                     node->nullable = 0;
2732                     node->firstpos =
2733                       tre_set_one(mem, lit->position, (int)lit->code_min,
2734                                   (int)lit->code_max, 0, NULL, -1);
2735                     if (!node->firstpos)
2736                       return REG_ESPACE;
2737                     node->lastpos = tre_set_one(mem, lit->position,
2738                                                 (int)lit->code_min,
2739                                                 (int)lit->code_max,
2740                                                 lit->u.class, lit->neg_classes,
2741                                                 -1);
2742                     if (!node->lastpos)
2743                       return REG_ESPACE;
2744                   }
2745                 break;
2746               }
2747
2748             case UNION:
2749               /* Compute the attributes for the two subtrees, and after that
2750                  for this node. */
2751               STACK_PUSHR(stack, voidptr, node);
2752               STACK_PUSHR(stack, int, NFL_POST_UNION);
2753               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2754               STACK_PUSHR(stack, int, NFL_RECURSE);
2755               STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2756               STACK_PUSHR(stack, int, NFL_RECURSE);
2757               break;
2758
2759             case CATENATION:
2760               /* Compute the attributes for the two subtrees, and after that
2761                  for this node. */
2762               STACK_PUSHR(stack, voidptr, node);
2763               STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2764               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2765               STACK_PUSHR(stack, int, NFL_RECURSE);
2766               STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2767               STACK_PUSHR(stack, int, NFL_RECURSE);
2768               break;
2769
2770             case ITERATION:
2771               /* Compute the attributes for the subtree, and after that for
2772                  this node. */
2773               STACK_PUSHR(stack, voidptr, node);
2774               STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2775               STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2776               STACK_PUSHR(stack, int, NFL_RECURSE);
2777               break;
2778             }
2779           break; /* end case: NFL_RECURSE */
2780
2781         case NFL_POST_UNION:
2782           {
2783             tre_union_t *uni = (tre_union_t *)node->obj;
2784             node->nullable = uni->left->nullable || uni->right->nullable;
2785             node->firstpos = tre_set_union(mem, uni->left->firstpos,
2786                                            uni->right->firstpos, NULL, 0);
2787             if (!node->firstpos)
2788               return REG_ESPACE;
2789             node->lastpos = tre_set_union(mem, uni->left->lastpos,
2790                                           uni->right->lastpos, NULL, 0);
2791             if (!node->lastpos)
2792               return REG_ESPACE;
2793             break;
2794           }
2795
2796         case NFL_POST_ITERATION:
2797           {
2798             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2799
2800             if (iter->min == 0 || iter->arg->nullable)
2801               node->nullable = 1;
2802             else
2803               node->nullable = 0;
2804             node->firstpos = iter->arg->firstpos;
2805             node->lastpos = iter->arg->lastpos;
2806             break;
2807           }
2808
2809         case NFL_POST_CATENATION:
2810           {
2811             int num_tags, *tags, assertions, params_seen;
2812             int *params;
2813             reg_errcode_t status;
2814             tre_catenation_t *cat = node->obj;
2815             node->nullable = cat->left->nullable && cat->right->nullable;
2816
2817             /* Compute firstpos. */
2818             if (cat->left->nullable)
2819               {
2820                 /* The left side matches the empty string.  Make a first pass
2821                    with tre_match_empty() to get the number of tags and
2822                    parameters. */
2823                 status = tre_match_empty(stack, cat->left,
2824                                          NULL, NULL, NULL, &num_tags,
2825                                          &params_seen);
2826                 if (status != REG_OK)
2827                   return status;
2828                 /* Allocate arrays for the tags and parameters. */
2829                 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2830                 if (!tags)
2831                   return REG_ESPACE;
2832                 tags[0] = -1;
2833                 assertions = 0;
2834                 /* Second pass with tre_mach_empty() to get the list of
2835                    tags and parameters. */
2836                 status = tre_match_empty(stack, cat->left, tags,
2837                                          &assertions, params, NULL, NULL);
2838                 if (status != REG_OK)
2839                   {
2840                     xfree(tags);
2841                     return status;
2842                   }
2843                 node->firstpos =
2844                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2845                                 tags, assertions);
2846                 xfree(tags);
2847                 if (!node->firstpos)
2848                   return REG_ESPACE;
2849               }
2850             else
2851               {
2852                 node->firstpos = cat->left->firstpos;
2853               }
2854
2855             /* Compute lastpos. */
2856             if (cat->right->nullable)
2857               {
2858                 /* The right side matches the empty string.  Make a first pass
2859                    with tre_match_empty() to get the number of tags and
2860                    parameters. */
2861                 status = tre_match_empty(stack, cat->right,
2862                                          NULL, NULL, NULL, &num_tags,
2863                                          &params_seen);
2864                 if (status != REG_OK)
2865                   return status;
2866                 /* Allocate arrays for the tags and parameters. */
2867                 tags = xmalloc(sizeof(int) * (num_tags + 1));
2868                 if (!tags)
2869                   return REG_ESPACE;
2870                 tags[0] = -1;
2871                 assertions = 0;
2872                 /* Second pass with tre_mach_empty() to get the list of
2873                    tags and parameters. */
2874                 status = tre_match_empty(stack, cat->right, tags,
2875                                          &assertions, params, NULL, NULL);
2876                 if (status != REG_OK)
2877                   {
2878                     xfree(tags);
2879                     return status;
2880                   }
2881                 node->lastpos =
2882                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2883                                 tags, assertions);
2884                 xfree(tags);
2885                 if (!node->lastpos)
2886                   return REG_ESPACE;
2887               }
2888             else
2889               {
2890                 node->lastpos = cat->right->lastpos;
2891               }
2892             break;
2893           }
2894
2895         default:
2896           assert(0);
2897           break;
2898         }
2899     }
2900
2901   return REG_OK;
2902 }
2903
2904
2905 /* Adds a transition from each position in `p1' to each position in `p2'. */
2906 static reg_errcode_t
2907 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2908                tre_tnfa_transition_t *transitions,
2909                int *counts, int *offs)
2910 {
2911   tre_pos_and_tags_t *orig_p2 = p2;
2912   tre_tnfa_transition_t *trans;
2913   int i, j, k, l, dup, prev_p2_pos;
2914
2915   if (transitions != NULL)
2916     while (p1->position >= 0)
2917       {
2918         p2 = orig_p2;
2919         prev_p2_pos = -1;
2920         while (p2->position >= 0)
2921           {
2922             /* Optimization: if this position was already handled, skip it. */
2923             if (p2->position == prev_p2_pos)
2924               {
2925                 p2++;
2926                 continue;
2927               }
2928             prev_p2_pos = p2->position;
2929             /* Set `trans' to point to the next unused transition from
2930                position `p1->position'. */
2931             trans = transitions + offs[p1->position];
2932             while (trans->state != NULL)
2933               {
2934 #if 0
2935                 /* If we find a previous transition from `p1->position' to
2936                    `p2->position', it is overwritten.  This can happen only
2937                    if there are nested loops in the regexp, like in "((a)*)*".
2938                    In POSIX.2 repetition using the outer loop is always
2939                    preferred over using the inner loop.  Therefore the
2940                    transition for the inner loop is useless and can be thrown
2941                    away. */
2942                 /* XXX - The same position is used for all nodes in a bracket
2943                    expression, so this optimization cannot be used (it will
2944                    break bracket expressions) unless I figure out a way to
2945                    detect it here. */
2946                 if (trans->state_id == p2->position)
2947                   {
2948                     break;
2949                   }
2950 #endif
2951                 trans++;
2952               }
2953
2954             if (trans->state == NULL)
2955               (trans + 1)->state = NULL;
2956             /* Use the character ranges, assertions, etc. from `p1' for
2957                the transition from `p1' to `p2'. */
2958             trans->code_min = p1->code_min;
2959             trans->code_max = p1->code_max;
2960             trans->state = transitions + offs[p2->position];
2961             trans->state_id = p2->position;
2962             trans->assertions = p1->assertions | p2->assertions
2963               | (p1->class ? ASSERT_CHAR_CLASS : 0)
2964               | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2965             if (p1->backref >= 0)
2966               {
2967                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2968                 assert(p2->backref < 0);
2969                 trans->u.backref = p1->backref;
2970                 trans->assertions |= ASSERT_BACKREF;
2971               }
2972             else
2973               trans->u.class = p1->class;
2974             if (p1->neg_classes != NULL)
2975               {
2976                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2977                 trans->neg_classes =
2978                   xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2979                 if (trans->neg_classes == NULL)
2980                   return REG_ESPACE;
2981                 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2982                   trans->neg_classes[i] = p1->neg_classes[i];
2983                 trans->neg_classes[i] = (tre_ctype_t)0;
2984               }
2985             else
2986               trans->neg_classes = NULL;
2987
2988             /* Find out how many tags this transition has. */
2989             i = 0;
2990             if (p1->tags != NULL)
2991               while(p1->tags[i] >= 0)
2992                 i++;
2993             j = 0;
2994             if (p2->tags != NULL)
2995               while(p2->tags[j] >= 0)
2996                 j++;
2997
2998             /* If we are overwriting a transition, free the old tag array. */
2999             if (trans->tags != NULL)
3000               xfree(trans->tags);
3001             trans->tags = NULL;
3002
3003             /* If there were any tags, allocate an array and fill it. */
3004             if (i + j > 0)
3005               {
3006                 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
3007                 if (!trans->tags)
3008                   return REG_ESPACE;
3009                 i = 0;
3010                 if (p1->tags != NULL)
3011                   while(p1->tags[i] >= 0)
3012                     {
3013                       trans->tags[i] = p1->tags[i];
3014                       i++;
3015                     }
3016                 l = i;
3017                 j = 0;
3018                 if (p2->tags != NULL)
3019                   while (p2->tags[j] >= 0)
3020                     {
3021                       /* Don't add duplicates. */
3022                       dup = 0;
3023                       for (k = 0; k < i; k++)
3024                         if (trans->tags[k] == p2->tags[j])
3025                           {
3026                             dup = 1;
3027                             break;
3028                           }
3029                       if (!dup)
3030                         trans->tags[l++] = p2->tags[j];
3031                       j++;
3032                     }
3033                 trans->tags[l] = -1;
3034               }
3035
3036             p2++;
3037           }
3038         p1++;
3039       }
3040   else
3041     /* Compute a maximum limit for the number of transitions leaving
3042        from each state. */
3043     while (p1->position >= 0)
3044       {
3045         p2 = orig_p2;
3046         while (p2->position >= 0)
3047           {
3048             counts[p1->position]++;
3049             p2++;
3050           }
3051         p1++;
3052       }
3053   return REG_OK;
3054 }
3055
3056 /* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
3057    labelled with one character range (there are no transitions on empty
3058    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
3059    the regexp. */
3060 static reg_errcode_t
3061 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
3062                 int *counts, int *offs)
3063 {
3064   tre_union_t *uni;
3065   tre_catenation_t *cat;
3066   tre_iteration_t *iter;
3067   reg_errcode_t errcode = REG_OK;
3068
3069   /* XXX - recurse using a stack!. */
3070   switch (node->type)
3071     {
3072     case LITERAL:
3073       break;
3074     case UNION:
3075       uni = (tre_union_t *)node->obj;
3076       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
3077       if (errcode != REG_OK)
3078         return errcode;
3079       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
3080       break;
3081
3082     case CATENATION:
3083       cat = (tre_catenation_t *)node->obj;
3084       /* Add a transition from each position in cat->left->lastpos
3085          to each position in cat->right->firstpos. */
3086       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
3087                                transitions, counts, offs);
3088       if (errcode != REG_OK)
3089         return errcode;
3090       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
3091       if (errcode != REG_OK)
3092         return errcode;
3093       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
3094       break;
3095
3096     case ITERATION:
3097       iter = (tre_iteration_t *)node->obj;
3098       assert(iter->max == -1 || iter->max == 1);
3099
3100       if (iter->max == -1)
3101         {
3102           assert(iter->min == 0 || iter->min == 1);
3103           /* Add a transition from each last position in the iterated
3104              expression to each first position. */
3105           errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
3106                                    transitions, counts, offs);
3107           if (errcode != REG_OK)
3108             return errcode;
3109         }
3110       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
3111       break;
3112     }
3113   return errcode;
3114 }
3115
3116
3117 #define ERROR_EXIT(err)           \
3118   do                              \
3119     {                             \
3120       errcode = err;              \
3121       if (/*CONSTCOND*/1)         \
3122         goto error_exit;          \
3123     }                             \
3124  while (/*CONSTCOND*/0)
3125
3126
3127 int
3128 regcomp(regex_t *preg, const char *regex, int cflags)
3129 {
3130   tre_stack_t *stack;
3131   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
3132   tre_pos_and_tags_t *p;
3133   int *counts = NULL, *offs = NULL;
3134   int i, add = 0;
3135   tre_tnfa_transition_t *transitions, *initial;
3136   tre_tnfa_t *tnfa = NULL;
3137   tre_submatch_data_t *submatch_data;
3138   tre_tag_direction_t *tag_directions = NULL;
3139   reg_errcode_t errcode;
3140   tre_mem_t mem;
3141
3142   /* Parse context. */
3143   tre_parse_ctx_t parse_ctx;
3144
3145   /* Allocate a stack used throughout the compilation process for various
3146      purposes. */
3147   stack = tre_stack_new(512, 10240, 128);
3148   if (!stack)
3149     return REG_ESPACE;
3150   /* Allocate a fast memory allocator. */
3151   mem = tre_mem_new();
3152   if (!mem)
3153     {
3154       tre_stack_destroy(stack);
3155       return REG_ESPACE;
3156     }
3157
3158   /* Parse the regexp. */
3159   memset(&parse_ctx, 0, sizeof(parse_ctx));
3160   parse_ctx.mem = mem;
3161   parse_ctx.stack = stack;
3162   parse_ctx.re = regex;
3163   parse_ctx.cflags = cflags;
3164   parse_ctx.max_backref = -1;
3165   errcode = tre_parse(&parse_ctx);
3166   if (errcode != REG_OK)
3167     ERROR_EXIT(errcode);
3168   preg->re_nsub = parse_ctx.submatch_id - 1;
3169   tree = parse_ctx.result;
3170
3171   /* Back references and approximate matching cannot currently be used
3172      in the same regexp. */
3173   if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3174     ERROR_EXIT(REG_BADPAT);
3175
3176 #ifdef TRE_DEBUG
3177   tre_ast_print(tree);
3178 #endif /* TRE_DEBUG */
3179
3180   /* Referring to nonexistent subexpressions is illegal. */
3181   if (parse_ctx.max_backref > (int)preg->re_nsub)
3182     ERROR_EXIT(REG_ESUBREG);
3183
3184   /* Allocate the TNFA struct. */
3185   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3186   if (tnfa == NULL)
3187     ERROR_EXIT(REG_ESPACE);
3188   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3189   tnfa->have_approx = parse_ctx.have_approx;
3190   tnfa->num_submatches = parse_ctx.submatch_id;
3191
3192   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
3193      regexp does not have back references, this can be skipped. */
3194   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
3195     {
3196
3197       /* Figure out how many tags we will need. */
3198       errcode = tre_add_tags(NULL, stack, tree, tnfa);
3199       if (errcode != REG_OK)
3200         ERROR_EXIT(errcode);
3201
3202       if (tnfa->num_tags > 0)
3203         {
3204           tag_directions = xmalloc(sizeof(*tag_directions)
3205                                    * (tnfa->num_tags + 1));
3206           if (tag_directions == NULL)
3207             ERROR_EXIT(REG_ESPACE);
3208           tnfa->tag_directions = tag_directions;
3209           memset(tag_directions, -1,
3210                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
3211         }
3212       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
3213                                    sizeof(tnfa->minimal_tags));
3214       if (tnfa->minimal_tags == NULL)
3215         ERROR_EXIT(REG_ESPACE);
3216
3217       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3218                               sizeof(*submatch_data));
3219       if (submatch_data == NULL)
3220         ERROR_EXIT(REG_ESPACE);
3221       tnfa->submatch_data = submatch_data;
3222
3223       errcode = tre_add_tags(mem, stack, tree, tnfa);
3224       if (errcode != REG_OK)
3225         ERROR_EXIT(errcode);
3226
3227     }
3228
3229   /* Expand iteration nodes. */
3230   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3231                            tag_directions, &tnfa->params_depth);
3232   if (errcode != REG_OK)
3233     ERROR_EXIT(errcode);
3234
3235   /* Add a dummy node for the final state.
3236      XXX - For certain patterns this dummy node can be optimized away,
3237            for example "a*" or "ab*".   Figure out a simple way to detect
3238            this possibility. */
3239   tmp_ast_l = tree;
3240   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3241   if (tmp_ast_r == NULL)
3242     ERROR_EXIT(REG_ESPACE);
3243
3244   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3245   if (tree == NULL)
3246     ERROR_EXIT(REG_ESPACE);
3247
3248   errcode = tre_compute_nfl(mem, stack, tree);
3249   if (errcode != REG_OK)
3250     ERROR_EXIT(errcode);
3251
3252   counts = xmalloc(sizeof(int) * parse_ctx.position);
3253   if (counts == NULL)
3254     ERROR_EXIT(REG_ESPACE);
3255
3256   offs = xmalloc(sizeof(int) * parse_ctx.position);
3257   if (offs == NULL)
3258     ERROR_EXIT(REG_ESPACE);
3259
3260   for (i = 0; i < parse_ctx.position; i++)
3261     counts[i] = 0;
3262   tre_ast_to_tnfa(tree, NULL, counts, NULL);
3263
3264   add = 0;
3265   for (i = 0; i < parse_ctx.position; i++)
3266     {
3267       offs[i] = add;
3268       add += counts[i] + 1;
3269       counts[i] = 0;
3270     }
3271   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3272   if (transitions == NULL)
3273     ERROR_EXIT(REG_ESPACE);
3274   tnfa->transitions = transitions;
3275   tnfa->num_transitions = add;
3276
3277   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3278   if (errcode != REG_OK)
3279     ERROR_EXIT(errcode);
3280
3281   tnfa->firstpos_chars = NULL;
3282
3283   p = tree->firstpos;
3284   i = 0;
3285   while (p->position >= 0)
3286     {
3287       i++;
3288       p++;
3289     }
3290
3291   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3292   if (initial == NULL)
3293     ERROR_EXIT(REG_ESPACE);
3294   tnfa->initial = initial;
3295
3296   i = 0;
3297   for (p = tree->firstpos; p->position >= 0; p++)
3298     {
3299       initial[i].state = transitions + offs[p->position];
3300       initial[i].state_id = p->position;
3301       initial[i].tags = NULL;
3302       /* Copy the arrays p->tags, and p->params, they are allocated
3303          from a tre_mem object. */
3304       if (p->tags)
3305         {
3306           int j;
3307           for (j = 0; p->tags[j] >= 0; j++);
3308           initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3309           if (!initial[i].tags)
3310             ERROR_EXIT(REG_ESPACE);
3311           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3312         }
3313       initial[i].assertions = p->assertions;
3314       i++;
3315     }
3316   initial[i].state = NULL;
3317
3318   tnfa->num_transitions = add;
3319   tnfa->final = transitions + offs[tree->lastpos[0].position];
3320   tnfa->num_states = parse_ctx.position;
3321   tnfa->cflags = cflags;
3322
3323   tre_mem_destroy(mem);
3324   tre_stack_destroy(stack);
3325   xfree(counts);
3326   xfree(offs);
3327
3328   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3329   return REG_OK;
3330
3331  error_exit:
3332   /* Free everything that was allocated and return the error code. */
3333   tre_mem_destroy(mem);
3334   if (stack != NULL)
3335     tre_stack_destroy(stack);
3336   if (counts != NULL)
3337     xfree(counts);
3338   if (offs != NULL)
3339     xfree(offs);
3340   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3341   regfree(preg);
3342   return errcode;
3343 }
3344
3345
3346
3347
3348 void
3349 regfree(regex_t *preg)
3350 {
3351   tre_tnfa_t *tnfa;
3352   unsigned int i;
3353   tre_tnfa_transition_t *trans;
3354
3355   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3356   if (!tnfa)
3357     return;
3358
3359   for (i = 0; i < tnfa->num_transitions; i++)
3360     if (tnfa->transitions[i].state)
3361       {
3362         if (tnfa->transitions[i].tags)
3363           xfree(tnfa->transitions[i].tags);
3364         if (tnfa->transitions[i].neg_classes)
3365           xfree(tnfa->transitions[i].neg_classes);
3366       }
3367   if (tnfa->transitions)
3368     xfree(tnfa->transitions);
3369
3370   if (tnfa->initial)
3371     {
3372       for (trans = tnfa->initial; trans->state; trans++)
3373         {
3374           if (trans->tags)
3375             xfree(trans->tags);
3376         }
3377       xfree(tnfa->initial);
3378     }
3379
3380   if (tnfa->submatch_data)
3381     {
3382       for (i = 0; i < tnfa->num_submatches; i++)
3383         if (tnfa->submatch_data[i].parents)
3384           xfree(tnfa->submatch_data[i].parents);
3385       xfree(tnfa->submatch_data);
3386     }
3387
3388   if (tnfa->tag_directions)
3389     xfree(tnfa->tag_directions);
3390   if (tnfa->firstpos_chars)
3391     xfree(tnfa->firstpos_chars);
3392   if (tnfa->minimal_tags)
3393     xfree(tnfa->minimal_tags);
3394   xfree(tnfa);
3395 }