git.cworth.org Git - tar/blob - lib/regcomp.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Free Software Foundation, Inc.
   5    This file is part of the GNU C Library.
   6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   7
   8    This program is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 3, or (at your option)
  11    any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License along
  19    with this program; if not, write to the Free Software Foundation,
  20    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  21
  22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  23                                           size_t length, reg_syntax_t syntax);
  24 static void re_compile_fastmap_iter (regex_t *bufp,
  25                                      const re_dfastate_t *init_state,
  26                                      char *fastmap);
  27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  52                                    unsigned int constraint);
  53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  55                                          Idx node, bool root);
  56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  57 static Idx fetch_number (re_string_t *input, re_token_t *token,
  58                          reg_syntax_t syntax);
  59 static int peek_token (re_token_t *token, re_string_t *input,
  60                         reg_syntax_t syntax) internal_function;
  61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  62                           reg_syntax_t syntax, reg_errcode_t *err);
  63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  64                                   re_token_t *token, reg_syntax_t syntax,
  65                                   Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  67                                  re_token_t *token, reg_syntax_t syntax,
  68                                  Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  70                                      re_token_t *token, reg_syntax_t syntax,
  71                                      Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   Idx nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  76                                  re_dfa_t *dfa, re_token_t *token,
  77                                  reg_syntax_t syntax, reg_errcode_t *err);
  78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  79                                       re_token_t *token, reg_syntax_t syntax,
  80                                       reg_errcode_t *err);
  81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  82                                             re_string_t *regexp,
  83                                             re_token_t *token, int token_len,
  84                                             re_dfa_t *dfa,
  85                                             reg_syntax_t syntax,
  86                                             bool accept_hyphen);
  87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  88                                           re_string_t *regexp,
  89                                           re_token_t *token);
  90 #ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  92                                         re_charset_t *mbcset,
  93                                         Idx *equiv_class_alloc,
  94                                         const unsigned char *name);
  95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  96                                       bitset_t sbcset,
  97                                       re_charset_t *mbcset,
  98                                       Idx *char_class_alloc,
  99                                       const unsigned char *class_name,
 100                                       reg_syntax_t syntax);
 101 #else  /* not RE_ENABLE_I18N */
 102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 103                                         const unsigned char *name);
 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 105                                       bitset_t sbcset,
 106                                       const unsigned char *class_name,
 107                                       reg_syntax_t syntax);
 108 #endif /* not RE_ENABLE_I18N */
 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 110                                        RE_TRANSLATE_TYPE trans,
 111                                        const unsigned char *class_name,
 112                                        const unsigned char *extra,
 113                                        bool non_match, reg_errcode_t *err);
 114 static bin_tree_t *create_tree (re_dfa_t *dfa,
 115                                 bin_tree_t *left, bin_tree_t *right,
 116                                 re_token_type_t type);
 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 118                                       bin_tree_t *left, bin_tree_t *right,
 119                                       const re_token_t *token);
 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 121 static void free_token (re_token_t *node);
 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 124 \f
 125 /* This table gives an error message for each of the error codes listed
 126    in regex.h.  Obviously the order here has to be same as there.
 127    POSIX doesn't require that we do anything for REG_NOERROR,
 128    but why not be nice?  */
 129
 130 static const char __re_error_msgid[] =
 131   {
 132 #define REG_NOERROR_IDX 0
 133     gettext_noop ("Success")    /* REG_NOERROR */
 134     "\0"
 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 136     gettext_noop ("No match")   /* REG_NOMATCH */
 137     "\0"
 138 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 140     "\0"
 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 143     "\0"
 144 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 146     "\0"
 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 149     "\0"
 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 152     "\0"
 153 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 154     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 155     "\0"
 156 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 158     "\0"
 159 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 161     "\0"
 162 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 164     "\0"
 165 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 166     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 167     "\0"
 168 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 170     "\0"
 171 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 173     "\0"
 174 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 176     "\0"
 177 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 179     "\0"
 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 182   };
 183
 184 static const size_t __re_error_msgid_idx[] =
 185   {
 186     REG_NOERROR_IDX,
 187     REG_NOMATCH_IDX,
 188     REG_BADPAT_IDX,
 189     REG_ECOLLATE_IDX,
 190     REG_ECTYPE_IDX,
 191     REG_EESCAPE_IDX,
 192     REG_ESUBREG_IDX,
 193     REG_EBRACK_IDX,
 194     REG_EPAREN_IDX,
 195     REG_EBRACE_IDX,
 196     REG_BADBR_IDX,
 197     REG_ERANGE_IDX,
 198     REG_ESPACE_IDX,
 199     REG_BADRPT_IDX,
 200     REG_EEND_IDX,
 201     REG_ESIZE_IDX,
 202     REG_ERPAREN_IDX
 203   };
 204 \f
 205 /* Entry points for GNU code.  */
 206
 207 /* re_compile_pattern is the GNU regular expression compiler: it
 208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 209    Returns 0 if the pattern was valid, otherwise an error string.
 210
 211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 212    are set in BUFP on entry.  */
 213
 214 #ifdef _LIBC
 215 const char *
 216 re_compile_pattern (pattern, length, bufp)
 217     const char *pattern;
 218     size_t length;
 219     struct re_pattern_buffer *bufp;
 220 #else /* size_t might promote */
 221 const char *
 222 re_compile_pattern (const char *pattern, size_t length,
 223                     struct re_pattern_buffer *bufp)
 224 #endif
 225 {
 226   reg_errcode_t ret;
 227
 228   /* And GNU code determines whether or not to get register information
 229      by passing null for the REGS argument to re_match, etc., not by
 230      setting no_sub, unless RE_NO_SUB is set.  */
 231   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 232
 233   /* Match anchors at newline.  */
 234   bufp->newline_anchor = 1;
 235
 236   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 237
 238   if (!ret)
 239     return NULL;
 240   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 241 }
 242 #ifdef _LIBC
 243 weak_alias (__re_compile_pattern, re_compile_pattern)
 244 #endif
 245
 246 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 247    also be assigned to arbitrarily: each pattern buffer stores its own
 248    syntax, so it can be changed between regex compilations.  */
 249 /* This has no initializer because initialized variables in Emacs
 250    become read-only after dumping.  */
 251 reg_syntax_t re_syntax_options;
 252
 253
 254 /* Specify the precise syntax of regexps for compilation.  This provides
 255    for compatibility for various utilities which historically have
 256    different, incompatible syntaxes.
 257
 258    The argument SYNTAX is a bit mask comprised of the various bits
 259    defined in regex.h.  We return the old syntax.  */
 260
 261 reg_syntax_t
 262 re_set_syntax (syntax)
 263     reg_syntax_t syntax;
 264 {
 265   reg_syntax_t ret = re_syntax_options;
 266
 267   re_syntax_options = syntax;
 268   return ret;
 269 }
 270 #ifdef _LIBC
 271 weak_alias (__re_set_syntax, re_set_syntax)
 272 #endif
 273
 274 int
 275 re_compile_fastmap (bufp)
 276     struct re_pattern_buffer *bufp;
 277 {
 278   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 279   char *fastmap = bufp->fastmap;
 280
 281   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 282   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 283   if (dfa->init_state != dfa->init_state_word)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 285   if (dfa->init_state != dfa->init_state_nl)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 287   if (dfa->init_state != dfa->init_state_begbuf)
 288     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 289   bufp->fastmap_accurate = 1;
 290   return 0;
 291 }
 292 #ifdef _LIBC
 293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 294 #endif
 295
 296 static inline void
 297 __attribute ((always_inline))
 298 re_set_fastmap (char *fastmap, bool icase, int ch)
 299 {
 300   fastmap[ch] = 1;
 301   if (icase)
 302     fastmap[tolower (ch)] = 1;
 303 }
 304
 305 /* Helper function for re_compile_fastmap.
 306    Compile fastmap for the initial_state INIT_STATE.  */
 307
 308 static void
 309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 310                          char *fastmap)
 311 {
 312   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 313   Idx node_cnt;
 314   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 315   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 316     {
 317       Idx node = init_state->nodes.elems[node_cnt];
 318       re_token_type_t type = dfa->nodes[node].type;
 319
 320       if (type == CHARACTER)
 321         {
 322           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 323 #ifdef RE_ENABLE_I18N
 324           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 325             {
 326               unsigned char buf[MB_LEN_MAX];
 327               unsigned char *p;
 328               wchar_t wc;
 329               mbstate_t state;
 330
 331               p = buf;
 332               *p++ = dfa->nodes[node].opr.c;
 333               while (++node < dfa->nodes_len
 334                      && dfa->nodes[node].type == CHARACTER
 335                      && dfa->nodes[node].mb_partial)
 336                 *p++ = dfa->nodes[node].opr.c;
 337               memset (&state, '\0', sizeof (state));
 338               if (mbrtowc (&wc, (const char *) buf, p - buf,
 339                            &state) == p - buf
 340                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 341                       != (size_t) -1))
 342                 re_set_fastmap (fastmap, false, buf[0]);
 343             }
 344 #endif
 345         }
 346       else if (type == SIMPLE_BRACKET)
 347         {
 348           int i, ch;
 349           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 350             {
 351               int j;
 352               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 353               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 354                 if (w & ((bitset_word_t) 1 << j))
 355                   re_set_fastmap (fastmap, icase, ch);
 356             }
 357         }
 358 #ifdef RE_ENABLE_I18N
 359       else if (type == COMPLEX_BRACKET)
 360         {
 361           Idx i;
 362           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 363           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 364               || cset->nranges || cset->nchar_classes)
 365             {
 366 # ifdef _LIBC
 367               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 368                 {
 369                   /* In this case we want to catch the bytes which are
 370                      the first byte of any collation elements.
 371                      e.g. In da_DK, we want to catch 'a' since "aa"
 372                           is a valid collation element, and don't catch
 373                           'b' since 'b' is the only collation element
 374                           which starts from 'b'.  */
 375                   const int32_t *table = (const int32_t *)
 376                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 377                   for (i = 0; i < SBC_MAX; ++i)
 378                     if (table[i] < 0)
 379                       re_set_fastmap (fastmap, icase, i);
 380                 }
 381 # else
 382               if (dfa->mb_cur_max > 1)
 383                 for (i = 0; i < SBC_MAX; ++i)
 384                   if (__btowc (i) == WEOF)
 385                     re_set_fastmap (fastmap, icase, i);
 386 # endif /* not _LIBC */
 387             }
 388           for (i = 0; i < cset->nmbchars; ++i)
 389             {
 390               char buf[256];
 391               mbstate_t state;
 392               memset (&state, '\0', sizeof (state));
 393               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 394                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 395               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 396                 {
 397                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 398                       != (size_t) -1)
 399                     re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 400                 }
 401             }
 402         }
 403 #endif /* RE_ENABLE_I18N */
 404       else if (type == OP_PERIOD
 405 #ifdef RE_ENABLE_I18N
 406                || type == OP_UTF8_PERIOD
 407 #endif /* RE_ENABLE_I18N */
 408                || type == END_OF_RE)
 409         {
 410           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 411           if (type == END_OF_RE)
 412             bufp->can_be_null = 1;
 413           return;
 414         }
 415     }
 416 }
 417 \f
 418 /* Entry point for POSIX code.  */
 419 /* regcomp takes a regular expression as a string and compiles it.
 420
 421    PREG is a regex_t *.  We do not expect any fields to be initialized,
 422    since POSIX says we shouldn't.  Thus, we set
 423
 424      `buffer' to the compiled pattern;
 425      `used' to the length of the compiled pattern;
 426      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 427        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 428        RE_SYNTAX_POSIX_BASIC;
 429      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 430      `fastmap' to an allocated space for the fastmap;
 431      `fastmap_accurate' to zero;
 432      `re_nsub' to the number of subexpressions in PATTERN.
 433
 434    PATTERN is the address of the pattern string.
 435
 436    CFLAGS is a series of bits which affect compilation.
 437
 438      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 439      use POSIX basic syntax.
 440
 441      If REG_NEWLINE is set, then . and [^...] don't match newline.
 442      Also, regexec will try a match beginning after every newline.
 443
 444      If REG_ICASE is set, then we considers upper- and lowercase
 445      versions of letters to be equivalent when matching.
 446
 447      If REG_NOSUB is set, then when PREG is passed to regexec, that
 448      routine will report only success or failure, and nothing about the
 449      registers.
 450
 451    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 452    the return codes and their meanings.)  */
 453
 454 int
 455 regcomp (preg, pattern, cflags)
 456     regex_t *_Restrict_ preg;
 457     const char *_Restrict_ pattern;
 458     int cflags;
 459 {
 460   reg_errcode_t ret;
 461   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 462                          : RE_SYNTAX_POSIX_BASIC);
 463
 464   preg->buffer = NULL;
 465   preg->allocated = 0;
 466   preg->used = 0;
 467
 468   /* Try to allocate space for the fastmap.  */
 469   preg->fastmap = re_malloc (char, SBC_MAX);
 470   if (BE (preg->fastmap == NULL, 0))
 471     return REG_ESPACE;
 472
 473   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 474
 475   /* If REG_NEWLINE is set, newlines are treated differently.  */
 476   if (cflags & REG_NEWLINE)
 477     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 478       syntax &= ~RE_DOT_NEWLINE;
 479       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 480       /* It also changes the matching behavior.  */
 481       preg->newline_anchor = 1;
 482     }
 483   else
 484     preg->newline_anchor = 0;
 485   preg->no_sub = !!(cflags & REG_NOSUB);
 486   preg->translate = NULL;
 487
 488   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 489
 490   /* POSIX doesn't distinguish between an unmatched open-group and an
 491      unmatched close-group: both are REG_EPAREN.  */
 492   if (ret == REG_ERPAREN)
 493     ret = REG_EPAREN;
 494
 495   /* We have already checked preg->fastmap != NULL.  */
 496   if (BE (ret == REG_NOERROR, 1))
 497     /* Compute the fastmap now, since regexec cannot modify the pattern
 498        buffer.  This function never fails in this implementation.  */
 499     (void) re_compile_fastmap (preg);
 500   else
 501     {
 502       /* Some error occurred while compiling the expression.  */
 503       re_free (preg->fastmap);
 504       preg->fastmap = NULL;
 505     }
 506
 507   return (int) ret;
 508 }
 509 #ifdef _LIBC
 510 weak_alias (__regcomp, regcomp)
 511 #endif
 512
 513 /* Returns a message corresponding to an error code, ERRCODE, returned
 514    from either regcomp or regexec.   We don't use PREG here.  */
 515
 516 #ifdef _LIBC
 517 size_t
 518 regerror (errcode, preg, errbuf, errbuf_size)
 519     int errcode;
 520     const regex_t *_Restrict_ preg;
 521     char *_Restrict_ errbuf;
 522     size_t errbuf_size;
 523 #else /* size_t might promote */
 524 size_t
 525 regerror (int errcode, const regex_t *_Restrict_ preg,
 526           char *_Restrict_ errbuf, size_t errbuf_size)
 527 #endif
 528 {
 529   const char *msg;
 530   size_t msg_size;
 531
 532   if (BE (errcode < 0
 533           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 534                                / sizeof (__re_error_msgid_idx[0])), 0))
 535     /* Only error codes returned by the rest of the code should be passed
 536        to this routine.  If we are given anything else, or if other regex
 537        code generates an invalid error code, then the program has a bug.
 538        Dump core so we can fix it.  */
 539     abort ();
 540
 541   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 542
 543   msg_size = strlen (msg) + 1; /* Includes the null.  */
 544
 545   if (BE (errbuf_size != 0, 1))
 546     {
 547       size_t cpy_size = msg_size;
 548       if (BE (msg_size > errbuf_size, 0))
 549         {
 550           cpy_size = errbuf_size - 1;
 551           errbuf[cpy_size] = '\0';
 552         }
 553       memcpy (errbuf, msg, cpy_size);
 554     }
 555
 556   return msg_size;
 557 }
 558 #ifdef _LIBC
 559 weak_alias (__regerror, regerror)
 560 #endif
 561
 562
 563 #ifdef RE_ENABLE_I18N
 564 /* This static array is used for the map to single-byte characters when
 565    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 566    it the same all the time.  UTF-8 is the preferred encoding so this is
 567    a worthwhile optimization.  */
 568 static const bitset_t utf8_sb_map =
 569 {
 570   /* Set the first 128 bits.  */
 571 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 572 #  error "bitset_word_t is narrower than 32 bits"
 573 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 574   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 575 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 576   BITSET_WORD_MAX, BITSET_WORD_MAX,
 577 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 578   BITSET_WORD_MAX,
 579 # endif
 580   (BITSET_WORD_MAX
 581    >> (SBC_MAX % BITSET_WORD_BITS == 0
 582        ? 0
 583        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 584 };
 585 #endif
 586
 587
 588 static void
 589 free_dfa_content (re_dfa_t *dfa)
 590 {
 591   Idx i, j;
 592
 593   if (dfa->nodes)
 594     for (i = 0; i < dfa->nodes_len; ++i)
 595       free_token (dfa->nodes + i);
 596   re_free (dfa->nexts);
 597   for (i = 0; i < dfa->nodes_len; ++i)
 598     {
 599       if (dfa->eclosures != NULL)
 600         re_node_set_free (dfa->eclosures + i);
 601       if (dfa->inveclosures != NULL)
 602         re_node_set_free (dfa->inveclosures + i);
 603       if (dfa->edests != NULL)
 604         re_node_set_free (dfa->edests + i);
 605     }
 606   re_free (dfa->edests);
 607   re_free (dfa->eclosures);
 608   re_free (dfa->inveclosures);
 609   re_free (dfa->nodes);
 610
 611   if (dfa->state_table)
 612     for (i = 0; i <= dfa->state_hash_mask; ++i)
 613       {
 614         struct re_state_table_entry *entry = dfa->state_table + i;
 615         for (j = 0; j < entry->num; ++j)
 616           {
 617             re_dfastate_t *state = entry->array[j];
 618             free_state (state);
 619           }
 620         re_free (entry->array);
 621       }
 622   re_free (dfa->state_table);
 623 #ifdef RE_ENABLE_I18N
 624   if (dfa->sb_char != utf8_sb_map)
 625     re_free (dfa->sb_char);
 626 #endif
 627   re_free (dfa->subexp_map);
 628 #ifdef DEBUG
 629   re_free (dfa->re_str);
 630 #endif
 631
 632   re_free (dfa);
 633 }
 634
 635
 636 /* Free dynamically allocated space used by PREG.  */
 637
 638 void
 639 regfree (preg)
 640     regex_t *preg;
 641 {
 642   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 643   if (BE (dfa != NULL, 1))
 644     free_dfa_content (dfa);
 645   preg->buffer = NULL;
 646   preg->allocated = 0;
 647
 648   re_free (preg->fastmap);
 649   preg->fastmap = NULL;
 650
 651   re_free (preg->translate);
 652   preg->translate = NULL;
 653 }
 654 #ifdef _LIBC
 655 weak_alias (__regfree, regfree)
 656 #endif
 657 \f
 658 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 659    them unless specifically requested.  */
 660
 661 #if defined _REGEX_RE_COMP || defined _LIBC
 662
 663 /* BSD has one and only one pattern buffer.  */
 664 static struct re_pattern_buffer re_comp_buf;
 665
 666 char *
 667 # ifdef _LIBC
 668 /* Make these definitions weak in libc, so POSIX programs can redefine
 669    these names if they don't use our functions, and still use
 670    regcomp/regexec above without link errors.  */
 671 weak_function
 672 # endif
 673 re_comp (s)
 674      const char *s;
 675 {
 676   reg_errcode_t ret;
 677   char *fastmap;
 678
 679   if (!s)
 680     {
 681       if (!re_comp_buf.buffer)
 682         return gettext ("No previous regular expression");
 683       return 0;
 684     }
 685
 686   if (re_comp_buf.buffer)
 687     {
 688       fastmap = re_comp_buf.fastmap;
 689       re_comp_buf.fastmap = NULL;
 690       __regfree (&re_comp_buf);
 691       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 692       re_comp_buf.fastmap = fastmap;
 693     }
 694
 695   if (re_comp_buf.fastmap == NULL)
 696     {
 697       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 698       if (re_comp_buf.fastmap == NULL)
 699         return (char *) gettext (__re_error_msgid
 700                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 701     }
 702
 703   /* Since `re_exec' always passes NULL for the `regs' argument, we
 704      don't need to initialize the pattern buffer fields which affect it.  */
 705
 706   /* Match anchors at newlines.  */
 707   re_comp_buf.newline_anchor = 1;
 708
 709   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 710
 711   if (!ret)
 712     return NULL;
 713
 714   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 715   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 716 }
 717
 718 #ifdef _LIBC
 719 libc_freeres_fn (free_mem)
 720 {
 721   __regfree (&re_comp_buf);
 722 }
 723 #endif
 724
 725 #endif /* _REGEX_RE_COMP */
 726 \f
 727 /* Internal entry point.
 728    Compile the regular expression PATTERN, whose length is LENGTH.
 729    SYNTAX indicate regular expression's syntax.  */
 730
 731 static reg_errcode_t
 732 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 733                      reg_syntax_t syntax)
 734 {
 735   reg_errcode_t err = REG_NOERROR;
 736   re_dfa_t *dfa;
 737   re_string_t regexp;
 738
 739   /* Initialize the pattern buffer.  */
 740   preg->fastmap_accurate = 0;
 741   preg->syntax = syntax;
 742   preg->not_bol = preg->not_eol = 0;
 743   preg->used = 0;
 744   preg->re_nsub = 0;
 745   preg->can_be_null = 0;
 746   preg->regs_allocated = REGS_UNALLOCATED;
 747
 748   /* Initialize the dfa.  */
 749   dfa = (re_dfa_t *) preg->buffer;
 750   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 751     {
 752       /* If zero allocated, but buffer is non-null, try to realloc
 753          enough space.  This loses if buffer's address is bogus, but
 754          that is the user's responsibility.  If ->buffer is NULL this
 755          is a simple allocation.  */
 756       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 757       if (dfa == NULL)
 758         return REG_ESPACE;
 759       preg->allocated = sizeof (re_dfa_t);
 760       preg->buffer = (unsigned char *) dfa;
 761     }
 762   preg->used = sizeof (re_dfa_t);
 763
 764   err = init_dfa (dfa, length);
 765   if (BE (err != REG_NOERROR, 0))
 766     {
 767       free_dfa_content (dfa);
 768       preg->buffer = NULL;
 769       preg->allocated = 0;
 770       return err;
 771     }
 772 #ifdef DEBUG
 773   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 774   dfa->re_str = re_malloc (char, length + 1);
 775   strncpy (dfa->re_str, pattern, length + 1);
 776 #endif
 777
 778   __libc_lock_init (dfa->lock);
 779
 780   err = re_string_construct (&regexp, pattern, length, preg->translate,
 781                              (syntax & RE_ICASE) != 0, dfa);
 782   if (BE (err != REG_NOERROR, 0))
 783     {
 784     re_compile_internal_free_return:
 785       free_workarea_compile (preg);
 786       re_string_destruct (&regexp);
 787       free_dfa_content (dfa);
 788       preg->buffer = NULL;
 789       preg->allocated = 0;
 790       return err;
 791     }
 792
 793   /* Parse the regular expression, and build a structure tree.  */
 794   preg->re_nsub = 0;
 795   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 796   if (BE (dfa->str_tree == NULL, 0))
 797     goto re_compile_internal_free_return;
 798
 799   /* Analyze the tree and create the nfa.  */
 800   err = analyze (preg);
 801   if (BE (err != REG_NOERROR, 0))
 802     goto re_compile_internal_free_return;
 803
 804 #ifdef RE_ENABLE_I18N
 805   /* If possible, do searching in single byte encoding to speed things up.  */
 806   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 807     optimize_utf8 (dfa);
 808 #endif
 809
 810   /* Then create the initial state of the dfa.  */
 811   err = create_initial_state (dfa);
 812
 813   /* Release work areas.  */
 814   free_workarea_compile (preg);
 815   re_string_destruct (&regexp);
 816
 817   if (BE (err != REG_NOERROR, 0))
 818     {
 819       free_dfa_content (dfa);
 820       preg->buffer = NULL;
 821       preg->allocated = 0;
 822     }
 823
 824   return err;
 825 }
 826
 827 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 828    as the initial length of some arrays.  */
 829
 830 static reg_errcode_t
 831 init_dfa (re_dfa_t *dfa, size_t pat_len)
 832 {
 833   __re_size_t table_size;
 834 #ifdef RE_ENABLE_I18N
 835   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 836 #else
 837   size_t max_i18n_object_size = 0;
 838 #endif
 839   size_t max_object_size =
 840     MAX (sizeof (struct re_state_table_entry),
 841          MAX (sizeof (re_token_t),
 842               MAX (sizeof (re_node_set),
 843                    MAX (sizeof (regmatch_t),
 844                         max_i18n_object_size))));
 845
 846   memset (dfa, '\0', sizeof (re_dfa_t));
 847
 848   /* Force allocation of str_tree_storage the first time.  */
 849   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 850
 851   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 852      calculation below, and for similar doubling calculations
 853      elsewhere.  And it's <= rather than <, because some of the
 854      doubling calculations add 1 afterwards.  */
 855   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 856     return REG_ESPACE;
 857
 858   dfa->nodes_alloc = pat_len + 1;
 859   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 860
 861   /*  table_size = 2 ^ ceil(log pat_len) */
 862   for (table_size = 1; ; table_size <<= 1)
 863     if (table_size > pat_len)
 864       break;
 865
 866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867   dfa->state_hash_mask = table_size - 1;
 868
 869   dfa->mb_cur_max = MB_CUR_MAX;
 870 #ifdef _LIBC
 871   if (dfa->mb_cur_max == 6
 872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873     dfa->is_utf8 = 1;
 874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                        != 0);
 876 #else
 877   if (strcmp (locale_charset (), "UTF-8") == 0)
 878     dfa->is_utf8 = 1;
 879
 880   /* We check exhaustively in the loop below if this charset is a
 881      superset of ASCII.  */
 882   dfa->map_notascii = 0;
 883 #endif
 884
 885 #ifdef RE_ENABLE_I18N
 886   if (dfa->mb_cur_max > 1)
 887     {
 888       if (dfa->is_utf8)
 889         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 890       else
 891         {
 892           int i, j, ch;
 893
 894           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 895           if (BE (dfa->sb_char == NULL, 0))
 896             return REG_ESPACE;
 897
 898           /* Set the bits corresponding to single byte chars.  */
 899           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 900             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 901               {
 902                 wint_t wch = __btowc (ch);
 903                 if (wch != WEOF)
 904                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 905 # ifndef _LIBC
 906                 if (isascii (ch) && wch != ch)
 907                   dfa->map_notascii = 1;
 908 # endif
 909               }
 910         }
 911     }
 912 #endif
 913
 914   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 915     return REG_ESPACE;
 916   return REG_NOERROR;
 917 }
 918
 919 /* Initialize WORD_CHAR table, which indicate which character is
 920    "word".  In this case "word" means that it is the word construction
 921    character used by some operators like "\<", "\>", etc.  */
 922
 923 static void
 924 internal_function
 925 init_word_char (re_dfa_t *dfa)
 926 {
 927   int i, j, ch;
 928   dfa->word_ops_used = 1;
 929   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 930     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 931       if (isalnum (ch) || ch == '_')
 932         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 933 }
 934
 935 /* Free the work area which are only used while compiling.  */
 936
 937 static void
 938 free_workarea_compile (regex_t *preg)
 939 {
 940   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 941   bin_tree_storage_t *storage, *next;
 942   for (storage = dfa->str_tree_storage; storage; storage = next)
 943     {
 944       next = storage->next;
 945       re_free (storage);
 946     }
 947   dfa->str_tree_storage = NULL;
 948   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 949   dfa->str_tree = NULL;
 950   re_free (dfa->org_indices);
 951   dfa->org_indices = NULL;
 952 }
 953
 954 /* Create initial states for all contexts.  */
 955
 956 static reg_errcode_t
 957 create_initial_state (re_dfa_t *dfa)
 958 {
 959   Idx first, i;
 960   reg_errcode_t err;
 961   re_node_set init_nodes;
 962
 963   /* Initial states have the epsilon closure of the node which is
 964      the first node of the regular expression.  */
 965   first = dfa->str_tree->first->node_idx;
 966   dfa->init_node = first;
 967   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 968   if (BE (err != REG_NOERROR, 0))
 969     return err;
 970
 971   /* The back-references which are in initial states can epsilon transit,
 972      since in this case all of the subexpressions can be null.
 973      Then we add epsilon closures of the nodes which are the next nodes of
 974      the back-references.  */
 975   if (dfa->nbackref > 0)
 976     for (i = 0; i < init_nodes.nelem; ++i)
 977       {
 978         Idx node_idx = init_nodes.elems[i];
 979         re_token_type_t type = dfa->nodes[node_idx].type;
 980
 981         Idx clexp_idx;
 982         if (type != OP_BACK_REF)
 983           continue;
 984         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 985           {
 986             re_token_t *clexp_node;
 987             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 988             if (clexp_node->type == OP_CLOSE_SUBEXP
 989                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 990               break;
 991           }
 992         if (clexp_idx == init_nodes.nelem)
 993           continue;
 994
 995         if (type == OP_BACK_REF)
 996           {
 997             Idx dest_idx = dfa->edests[node_idx].elems[0];
 998             if (!re_node_set_contains (&init_nodes, dest_idx))
 999               {
1000                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1001                 i = 0;
1002               }
1003           }
1004       }
1005
1006   /* It must be the first time to invoke acquire_state.  */
1007   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1008   /* We don't check ERR here, since the initial state must not be NULL.  */
1009   if (BE (dfa->init_state == NULL, 0))
1010     return err;
1011   if (dfa->init_state->has_constraint)
1012     {
1013       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1014                                                        CONTEXT_WORD);
1015       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1016                                                      CONTEXT_NEWLINE);
1017       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1018                                                          &init_nodes,
1019                                                          CONTEXT_NEWLINE
1020                                                          | CONTEXT_BEGBUF);
1021       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1022               || dfa->init_state_begbuf == NULL, 0))
1023         return err;
1024     }
1025   else
1026     dfa->init_state_word = dfa->init_state_nl
1027       = dfa->init_state_begbuf = dfa->init_state;
1028
1029   re_node_set_free (&init_nodes);
1030   return REG_NOERROR;
1031 }
1032 \f
1033 #ifdef RE_ENABLE_I18N
1034 /* If it is possible to do searching in single byte encoding instead of UTF-8
1035    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1036    DFA nodes where needed.  */
1037
1038 static void
1039 optimize_utf8 (re_dfa_t *dfa)
1040 {
1041   Idx node;
1042   int i;
1043   bool mb_chars = false;
1044   bool has_period = false;
1045
1046   for (node = 0; node < dfa->nodes_len; ++node)
1047     switch (dfa->nodes[node].type)
1048       {
1049       case CHARACTER:
1050         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1051           mb_chars = true;
1052         break;
1053       case ANCHOR:
1054         switch (dfa->nodes[node].opr.ctx_type)
1055           {
1056           case LINE_FIRST:
1057           case LINE_LAST:
1058           case BUF_FIRST:
1059           case BUF_LAST:
1060             break;
1061           default:
1062             /* Word anchors etc. cannot be handled.  It's okay to test
1063                opr.ctx_type since constraints (for all DFA nodes) are
1064                created by ORing one or more opr.ctx_type values.  */
1065             return;
1066           }
1067         break;
1068       case OP_PERIOD:
1069         has_period = true;
1070         break;
1071       case OP_BACK_REF:
1072       case OP_ALT:
1073       case END_OF_RE:
1074       case OP_DUP_ASTERISK:
1075       case OP_OPEN_SUBEXP:
1076       case OP_CLOSE_SUBEXP:
1077         break;
1078       case COMPLEX_BRACKET:
1079         return;
1080       case SIMPLE_BRACKET:
1081         /* Just double check.  */
1082         {
1083           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1084                         ? 0
1085                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1086           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1087             {
1088               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1089                 return;
1090               rshift = 0;
1091             }
1092         }
1093         break;
1094       default:
1095         abort ();
1096       }
1097
1098   if (mb_chars || has_period)
1099     for (node = 0; node < dfa->nodes_len; ++node)
1100       {
1101         if (dfa->nodes[node].type == CHARACTER
1102             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1103           dfa->nodes[node].mb_partial = 0;
1104         else if (dfa->nodes[node].type == OP_PERIOD)
1105           dfa->nodes[node].type = OP_UTF8_PERIOD;
1106       }
1107
1108   /* The search can be in single byte locale.  */
1109   dfa->mb_cur_max = 1;
1110   dfa->is_utf8 = 0;
1111   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1112 }
1113 #endif
1114 \f
1115 /* Analyze the structure tree, and calculate "first", "next", "edest",
1116    "eclosure", and "inveclosure".  */
1117
1118 static reg_errcode_t
1119 analyze (regex_t *preg)
1120 {
1121   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1122   reg_errcode_t ret;
1123
1124   /* Allocate arrays.  */
1125   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1126   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1127   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1128   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1129   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1130           || dfa->eclosures == NULL, 0))
1131     return REG_ESPACE;
1132
1133   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1134   if (dfa->subexp_map != NULL)
1135     {
1136       Idx i;
1137       for (i = 0; i < preg->re_nsub; i++)
1138         dfa->subexp_map[i] = i;
1139       preorder (dfa->str_tree, optimize_subexps, dfa);
1140       for (i = 0; i < preg->re_nsub; i++)
1141         if (dfa->subexp_map[i] != i)
1142           break;
1143       if (i == preg->re_nsub)
1144         {
1145           free (dfa->subexp_map);
1146           dfa->subexp_map = NULL;
1147         }
1148     }
1149
1150   ret = postorder (dfa->str_tree, lower_subexps, preg);
1151   if (BE (ret != REG_NOERROR, 0))
1152     return ret;
1153   ret = postorder (dfa->str_tree, calc_first, dfa);
1154   if (BE (ret != REG_NOERROR, 0))
1155     return ret;
1156   preorder (dfa->str_tree, calc_next, dfa);
1157   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1158   if (BE (ret != REG_NOERROR, 0))
1159     return ret;
1160   ret = calc_eclosure (dfa);
1161   if (BE (ret != REG_NOERROR, 0))
1162     return ret;
1163
1164   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1165      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1166   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1167       || dfa->nbackref)
1168     {
1169       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1170       if (BE (dfa->inveclosures == NULL, 0))
1171         return REG_ESPACE;
1172       ret = calc_inveclosure (dfa);
1173     }
1174
1175   return ret;
1176 }
1177
1178 /* Our parse trees are very unbalanced, so we cannot use a stack to
1179    implement parse tree visits.  Instead, we use parent pointers and
1180    some hairy code in these two functions.  */
1181 static reg_errcode_t
1182 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1183            void *extra)
1184 {
1185   bin_tree_t *node, *prev;
1186
1187   for (node = root; ; )
1188     {
1189       /* Descend down the tree, preferably to the left (or to the right
1190          if that's the only child).  */
1191       while (node->left || node->right)
1192         if (node->left)
1193           node = node->left;
1194         else
1195           node = node->right;
1196
1197       do
1198         {
1199           reg_errcode_t err = fn (extra, node);
1200           if (BE (err != REG_NOERROR, 0))
1201             return err;
1202           if (node->parent == NULL)
1203             return REG_NOERROR;
1204           prev = node;
1205           node = node->parent;
1206         }
1207       /* Go up while we have a node that is reached from the right.  */
1208       while (node->right == prev || node->right == NULL);
1209       node = node->right;
1210     }
1211 }
1212
1213 static reg_errcode_t
1214 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1215           void *extra)
1216 {
1217   bin_tree_t *node;
1218
1219   for (node = root; ; )
1220     {
1221       reg_errcode_t err = fn (extra, node);
1222       if (BE (err != REG_NOERROR, 0))
1223         return err;
1224
1225       /* Go to the left node, or up and to the right.  */
1226       if (node->left)
1227         node = node->left;
1228       else
1229         {
1230           bin_tree_t *prev = NULL;
1231           while (node->right == prev || node->right == NULL)
1232             {
1233               prev = node;
1234               node = node->parent;
1235               if (!node)
1236                 return REG_NOERROR;
1237             }
1238           node = node->right;
1239         }
1240     }
1241 }
1242
1243 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1244    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1245    backreferences as well.  Requires a preorder visit.  */
1246 static reg_errcode_t
1247 optimize_subexps (void *extra, bin_tree_t *node)
1248 {
1249   re_dfa_t *dfa = (re_dfa_t *) extra;
1250
1251   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1252     {
1253       int idx = node->token.opr.idx;
1254       node->token.opr.idx = dfa->subexp_map[idx];
1255       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1256     }
1257
1258   else if (node->token.type == SUBEXP
1259            && node->left && node->left->token.type == SUBEXP)
1260     {
1261       Idx other_idx = node->left->token.opr.idx;
1262
1263       node->left = node->left->left;
1264       if (node->left)
1265         node->left->parent = node;
1266
1267       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1268       if (other_idx < BITSET_WORD_BITS)
1269         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1270     }
1271
1272   return REG_NOERROR;
1273 }
1274
1275 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1276    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1277 static reg_errcode_t
1278 lower_subexps (void *extra, bin_tree_t *node)
1279 {
1280   regex_t *preg = (regex_t *) extra;
1281   reg_errcode_t err = REG_NOERROR;
1282
1283   if (node->left && node->left->token.type == SUBEXP)
1284     {
1285       node->left = lower_subexp (&err, preg, node->left);
1286       if (node->left)
1287         node->left->parent = node;
1288     }
1289   if (node->right && node->right->token.type == SUBEXP)
1290     {
1291       node->right = lower_subexp (&err, preg, node->right);
1292       if (node->right)
1293         node->right->parent = node;
1294     }
1295
1296   return err;
1297 }
1298
1299 static bin_tree_t *
1300 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1301 {
1302   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1303   bin_tree_t *body = node->left;
1304   bin_tree_t *op, *cls, *tree1, *tree;
1305
1306   if (preg->no_sub
1307       /* We do not optimize empty subexpressions, because otherwise we may
1308          have bad CONCAT nodes with NULL children.  This is obviously not
1309          very common, so we do not lose much.  An example that triggers
1310          this case is the sed "script" /\(\)/x.  */
1311       && node->left != NULL
1312       && (node->token.opr.idx >= BITSET_WORD_BITS
1313           || !(dfa->used_bkref_map
1314                & ((bitset_word_t) 1 << node->token.opr.idx))))
1315     return node->left;
1316
1317   /* Convert the SUBEXP node to the concatenation of an
1318      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1319   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1320   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1321   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1322   tree = create_tree (dfa, op, tree1, CONCAT);
1323   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1324     {
1325       *err = REG_ESPACE;
1326       return NULL;
1327     }
1328
1329   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1330   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1331   return tree;
1332 }
1333
1334 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1335    nodes.  Requires a postorder visit.  */
1336 static reg_errcode_t
1337 calc_first (void *extra, bin_tree_t *node)
1338 {
1339   re_dfa_t *dfa = (re_dfa_t *) extra;
1340   if (node->token.type == CONCAT)
1341     {
1342       node->first = node->left->first;
1343       node->node_idx = node->left->node_idx;
1344     }
1345   else
1346     {
1347       node->first = node;
1348       node->node_idx = re_dfa_add_node (dfa, node->token);
1349       if (BE (node->node_idx == REG_MISSING, 0))
1350         return REG_ESPACE;
1351       if (node->token.type == ANCHOR)
1352         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1353     }
1354   return REG_NOERROR;
1355 }
1356
1357 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1358 static reg_errcode_t
1359 calc_next (void *extra, bin_tree_t *node)
1360 {
1361   switch (node->token.type)
1362     {
1363     case OP_DUP_ASTERISK:
1364       node->left->next = node;
1365       break;
1366     case CONCAT:
1367       node->left->next = node->right->first;
1368       node->right->next = node->next;
1369       break;
1370     default:
1371       if (node->left)
1372         node->left->next = node->next;
1373       if (node->right)
1374         node->right->next = node->next;
1375       break;
1376     }
1377   return REG_NOERROR;
1378 }
1379
1380 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1381 static reg_errcode_t
1382 link_nfa_nodes (void *extra, bin_tree_t *node)
1383 {
1384   re_dfa_t *dfa = (re_dfa_t *) extra;
1385   Idx idx = node->node_idx;
1386   reg_errcode_t err = REG_NOERROR;
1387
1388   switch (node->token.type)
1389     {
1390     case CONCAT:
1391       break;
1392
1393     case END_OF_RE:
1394       assert (node->next == NULL);
1395       break;
1396
1397     case OP_DUP_ASTERISK:
1398     case OP_ALT:
1399       {
1400         Idx left, right;
1401         dfa->has_plural_match = 1;
1402         if (node->left != NULL)
1403           left = node->left->first->node_idx;
1404         else
1405           left = node->next->node_idx;
1406         if (node->right != NULL)
1407           right = node->right->first->node_idx;
1408         else
1409           right = node->next->node_idx;
1410         assert (REG_VALID_INDEX (left));
1411         assert (REG_VALID_INDEX (right));
1412         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1413       }
1414       break;
1415
1416     case ANCHOR:
1417     case OP_OPEN_SUBEXP:
1418     case OP_CLOSE_SUBEXP:
1419       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1420       break;
1421
1422     case OP_BACK_REF:
1423       dfa->nexts[idx] = node->next->node_idx;
1424       if (node->token.type == OP_BACK_REF)
1425         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1426       break;
1427
1428     default:
1429       assert (!IS_EPSILON_NODE (node->token.type));
1430       dfa->nexts[idx] = node->next->node_idx;
1431       break;
1432     }
1433
1434   return err;
1435 }
1436
1437 /* Duplicate the epsilon closure of the node ROOT_NODE.
1438    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1439    to their own constraint.  */
1440
1441 static reg_errcode_t
1442 internal_function
1443 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1444                         Idx root_node, unsigned int init_constraint)
1445 {
1446   Idx org_node, clone_node;
1447   bool ok;
1448   unsigned int constraint = init_constraint;
1449   for (org_node = top_org_node, clone_node = top_clone_node;;)
1450     {
1451       Idx org_dest, clone_dest;
1452       if (dfa->nodes[org_node].type == OP_BACK_REF)
1453         {
1454           /* If the back reference epsilon-transit, its destination must
1455              also have the constraint.  Then duplicate the epsilon closure
1456              of the destination of the back reference, and store it in
1457              edests of the back reference.  */
1458           org_dest = dfa->nexts[org_node];
1459           re_node_set_empty (dfa->edests + clone_node);
1460           clone_dest = duplicate_node (dfa, org_dest, constraint);
1461           if (BE (clone_dest == REG_MISSING, 0))
1462             return REG_ESPACE;
1463           dfa->nexts[clone_node] = dfa->nexts[org_node];
1464           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1465           if (BE (! ok, 0))
1466             return REG_ESPACE;
1467         }
1468       else if (dfa->edests[org_node].nelem == 0)
1469         {
1470           /* In case of the node can't epsilon-transit, don't duplicate the
1471              destination and store the original destination as the
1472              destination of the node.  */
1473           dfa->nexts[clone_node] = dfa->nexts[org_node];
1474           break;
1475         }
1476       else if (dfa->edests[org_node].nelem == 1)
1477         {
1478           /* In case of the node can epsilon-transit, and it has only one
1479              destination.  */
1480           org_dest = dfa->edests[org_node].elems[0];
1481           re_node_set_empty (dfa->edests + clone_node);
1482           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1483           /* If the node is root_node itself, it means the epsilon closure
1484              has a loop.  Then tie it to the destination of the root_node.  */
1485           if (org_node == root_node && clone_node != org_node)
1486             {
1487               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1488               if (BE (! ok, 0))
1489                 return REG_ESPACE;
1490               break;
1491             }
1492           /* In case the node has another constraint, append it.  */
1493           constraint |= dfa->nodes[org_node].constraint;
1494           clone_dest = duplicate_node (dfa, org_dest, constraint);
1495           if (BE (clone_dest == REG_MISSING, 0))
1496             return REG_ESPACE;
1497           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498           if (BE (! ok, 0))
1499             return REG_ESPACE;
1500         }
1501       else /* dfa->edests[org_node].nelem == 2 */
1502         {
1503           /* In case of the node can epsilon-transit, and it has two
1504              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1505           org_dest = dfa->edests[org_node].elems[0];
1506           re_node_set_empty (dfa->edests + clone_node);
1507           /* Search for a duplicated node which satisfies the constraint.  */
1508           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1509           if (clone_dest == REG_MISSING)
1510             {
1511               /* There is no such duplicated node, create a new one.  */
1512               reg_errcode_t err;
1513               clone_dest = duplicate_node (dfa, org_dest, constraint);
1514               if (BE (clone_dest == REG_MISSING, 0))
1515                 return REG_ESPACE;
1516               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517               if (BE (! ok, 0))
1518                 return REG_ESPACE;
1519               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1520                                             root_node, constraint);
1521               if (BE (err != REG_NOERROR, 0))
1522                 return err;
1523             }
1524           else
1525             {
1526               /* There is a duplicated node which satisfy the constraint,
1527                  use it to avoid infinite loop.  */
1528               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529               if (BE (! ok, 0))
1530                 return REG_ESPACE;
1531             }
1532
1533           org_dest = dfa->edests[org_node].elems[1];
1534           clone_dest = duplicate_node (dfa, org_dest, constraint);
1535           if (BE (clone_dest == REG_MISSING, 0))
1536             return REG_ESPACE;
1537           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538           if (BE (! ok, 0))
1539             return REG_ESPACE;
1540         }
1541       org_node = org_dest;
1542       clone_node = clone_dest;
1543     }
1544   return REG_NOERROR;
1545 }
1546
1547 /* Search for a node which is duplicated from the node ORG_NODE, and
1548    satisfies the constraint CONSTRAINT.  */
1549
1550 static Idx
1551 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1552                         unsigned int constraint)
1553 {
1554   Idx idx;
1555   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1556     {
1557       if (org_node == dfa->org_indices[idx]
1558           && constraint == dfa->nodes[idx].constraint)
1559         return idx; /* Found.  */
1560     }
1561   return REG_MISSING; /* Not found.  */
1562 }
1563
1564 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1565    Return the index of the new node, or REG_MISSING if insufficient storage is
1566    available.  */
1567
1568 static Idx
1569 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1570 {
1571   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1572   if (BE (dup_idx != REG_MISSING, 1))
1573     {
1574       dfa->nodes[dup_idx].constraint = constraint;
1575       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1576       dfa->nodes[dup_idx].duplicated = 1;
1577
1578       /* Store the index of the original node.  */
1579       dfa->org_indices[dup_idx] = org_idx;
1580     }
1581   return dup_idx;
1582 }
1583
1584 static reg_errcode_t
1585 calc_inveclosure (re_dfa_t *dfa)
1586 {
1587   Idx src, idx;
1588   bool ok;
1589   for (idx = 0; idx < dfa->nodes_len; ++idx)
1590     re_node_set_init_empty (dfa->inveclosures + idx);
1591
1592   for (src = 0; src < dfa->nodes_len; ++src)
1593     {
1594       Idx *elems = dfa->eclosures[src].elems;
1595       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1596         {
1597           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1598           if (BE (! ok, 0))
1599             return REG_ESPACE;
1600         }
1601     }
1602
1603   return REG_NOERROR;
1604 }
1605
1606 /* Calculate "eclosure" for all the node in DFA.  */
1607
1608 static reg_errcode_t
1609 calc_eclosure (re_dfa_t *dfa)
1610 {
1611   Idx node_idx;
1612   bool incomplete;
1613 #ifdef DEBUG
1614   assert (dfa->nodes_len > 0);
1615 #endif
1616   incomplete = false;
1617   /* For each nodes, calculate epsilon closure.  */
1618   for (node_idx = 0; ; ++node_idx)
1619     {
1620       reg_errcode_t err;
1621       re_node_set eclosure_elem;
1622       if (node_idx == dfa->nodes_len)
1623         {
1624           if (!incomplete)
1625             break;
1626           incomplete = false;
1627           node_idx = 0;
1628         }
1629
1630 #ifdef DEBUG
1631       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1632 #endif
1633
1634       /* If we have already calculated, skip it.  */
1635       if (dfa->eclosures[node_idx].nelem != 0)
1636         continue;
1637       /* Calculate epsilon closure of `node_idx'.  */
1638       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1639       if (BE (err != REG_NOERROR, 0))
1640         return err;
1641
1642       if (dfa->eclosures[node_idx].nelem == 0)
1643         {
1644           incomplete = true;
1645           re_node_set_free (&eclosure_elem);
1646         }
1647     }
1648   return REG_NOERROR;
1649 }
1650
1651 /* Calculate epsilon closure of NODE.  */
1652
1653 static reg_errcode_t
1654 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1655 {
1656   reg_errcode_t err;
1657   Idx i;
1658   bool incomplete;
1659   bool ok;
1660   re_node_set eclosure;
1661   incomplete = false;
1662   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1663   if (BE (err != REG_NOERROR, 0))
1664     return err;
1665
1666   /* This indicates that we are calculating this node now.
1667      We reference this value to avoid infinite loop.  */
1668   dfa->eclosures[node].nelem = REG_MISSING;
1669
1670   /* If the current node has constraints, duplicate all nodes
1671      since they must inherit the constraints.  */
1672   if (dfa->nodes[node].constraint
1673       && dfa->edests[node].nelem
1674       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1675     {
1676       err = duplicate_node_closure (dfa, node, node, node,
1677                                     dfa->nodes[node].constraint);
1678       if (BE (err != REG_NOERROR, 0))
1679         return err;
1680     }
1681
1682   /* Expand each epsilon destination nodes.  */
1683   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1684     for (i = 0; i < dfa->edests[node].nelem; ++i)
1685       {
1686         re_node_set eclosure_elem;
1687         Idx edest = dfa->edests[node].elems[i];
1688         /* If calculating the epsilon closure of `edest' is in progress,
1689            return intermediate result.  */
1690         if (dfa->eclosures[edest].nelem == REG_MISSING)
1691           {
1692             incomplete = true;
1693             continue;
1694           }
1695         /* If we haven't calculated the epsilon closure of `edest' yet,
1696            calculate now. Otherwise use calculated epsilon closure.  */
1697         if (dfa->eclosures[edest].nelem == 0)
1698           {
1699             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1700             if (BE (err != REG_NOERROR, 0))
1701               return err;
1702           }
1703         else
1704           eclosure_elem = dfa->eclosures[edest];
1705         /* Merge the epsilon closure of `edest'.  */
1706         re_node_set_merge (&eclosure, &eclosure_elem);
1707         /* If the epsilon closure of `edest' is incomplete,
1708            the epsilon closure of this node is also incomplete.  */
1709         if (dfa->eclosures[edest].nelem == 0)
1710           {
1711             incomplete = true;
1712             re_node_set_free (&eclosure_elem);
1713           }
1714       }
1715
1716   /* Epsilon closures include itself.  */
1717   ok = re_node_set_insert (&eclosure, node);
1718   if (BE (! ok, 0))
1719     return REG_ESPACE;
1720   if (incomplete && !root)
1721     dfa->eclosures[node].nelem = 0;
1722   else
1723     dfa->eclosures[node] = eclosure;
1724   *new_set = eclosure;
1725   return REG_NOERROR;
1726 }
1727 \f
1728 /* Functions for token which are used in the parser.  */
1729
1730 /* Fetch a token from INPUT.
1731    We must not use this function inside bracket expressions.  */
1732
1733 static void
1734 internal_function
1735 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1736 {
1737   re_string_skip_bytes (input, peek_token (result, input, syntax));
1738 }
1739
1740 /* Peek a token from INPUT, and return the length of the token.
1741    We must not use this function inside bracket expressions.  */
1742
1743 static int
1744 internal_function
1745 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1746 {
1747   unsigned char c;
1748
1749   if (re_string_eoi (input))
1750     {
1751       token->type = END_OF_RE;
1752       return 0;
1753     }
1754
1755   c = re_string_peek_byte (input, 0);
1756   token->opr.c = c;
1757
1758   token->word_char = 0;
1759 #ifdef RE_ENABLE_I18N
1760   token->mb_partial = 0;
1761   if (input->mb_cur_max > 1 &&
1762       !re_string_first_byte (input, re_string_cur_idx (input)))
1763     {
1764       token->type = CHARACTER;
1765       token->mb_partial = 1;
1766       return 1;
1767     }
1768 #endif
1769   if (c == '\\')
1770     {
1771       unsigned char c2;
1772       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1773         {
1774           token->type = BACK_SLASH;
1775           return 1;
1776         }
1777
1778       c2 = re_string_peek_byte_case (input, 1);
1779       token->opr.c = c2;
1780       token->type = CHARACTER;
1781 #ifdef RE_ENABLE_I18N
1782       if (input->mb_cur_max > 1)
1783         {
1784           wint_t wc = re_string_wchar_at (input,
1785                                           re_string_cur_idx (input) + 1);
1786           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1787         }
1788       else
1789 #endif
1790         token->word_char = IS_WORD_CHAR (c2) != 0;
1791
1792       switch (c2)
1793         {
1794         case '|':
1795           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1796             token->type = OP_ALT;
1797           break;
1798         case '1': case '2': case '3': case '4': case '5':
1799         case '6': case '7': case '8': case '9':
1800           if (!(syntax & RE_NO_BK_REFS))
1801             {
1802               token->type = OP_BACK_REF;
1803               token->opr.idx = c2 - '1';
1804             }
1805           break;
1806         case '<':
1807           if (!(syntax & RE_NO_GNU_OPS))
1808             {
1809               token->type = ANCHOR;
1810               token->opr.ctx_type = WORD_FIRST;
1811             }
1812           break;
1813         case '>':
1814           if (!(syntax & RE_NO_GNU_OPS))
1815             {
1816               token->type = ANCHOR;
1817               token->opr.ctx_type = WORD_LAST;
1818             }
1819           break;
1820         case 'b':
1821           if (!(syntax & RE_NO_GNU_OPS))
1822             {
1823               token->type = ANCHOR;
1824               token->opr.ctx_type = WORD_DELIM;
1825             }
1826           break;
1827         case 'B':
1828           if (!(syntax & RE_NO_GNU_OPS))
1829             {
1830               token->type = ANCHOR;
1831               token->opr.ctx_type = NOT_WORD_DELIM;
1832             }
1833           break;
1834         case 'w':
1835           if (!(syntax & RE_NO_GNU_OPS))
1836             token->type = OP_WORD;
1837           break;
1838         case 'W':
1839           if (!(syntax & RE_NO_GNU_OPS))
1840             token->type = OP_NOTWORD;
1841           break;
1842         case 's':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             token->type = OP_SPACE;
1845           break;
1846         case 'S':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             token->type = OP_NOTSPACE;
1849           break;
1850         case '`':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             {
1853               token->type = ANCHOR;
1854               token->opr.ctx_type = BUF_FIRST;
1855             }
1856           break;
1857         case '\'':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             {
1860               token->type = ANCHOR;
1861               token->opr.ctx_type = BUF_LAST;
1862             }
1863           break;
1864         case '(':
1865           if (!(syntax & RE_NO_BK_PARENS))
1866             token->type = OP_OPEN_SUBEXP;
1867           break;
1868         case ')':
1869           if (!(syntax & RE_NO_BK_PARENS))
1870             token->type = OP_CLOSE_SUBEXP;
1871           break;
1872         case '+':
1873           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1874             token->type = OP_DUP_PLUS;
1875           break;
1876         case '?':
1877           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1878             token->type = OP_DUP_QUESTION;
1879           break;
1880         case '{':
1881           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1882             token->type = OP_OPEN_DUP_NUM;
1883           break;
1884         case '}':
1885           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1886             token->type = OP_CLOSE_DUP_NUM;
1887           break;
1888         default:
1889           break;
1890         }
1891       return 2;
1892     }
1893
1894   token->type = CHARACTER;
1895 #ifdef RE_ENABLE_I18N
1896   if (input->mb_cur_max > 1)
1897     {
1898       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1899       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1900     }
1901   else
1902 #endif
1903     token->word_char = IS_WORD_CHAR (token->opr.c);
1904
1905   switch (c)
1906     {
1907     case '\n':
1908       if (syntax & RE_NEWLINE_ALT)
1909         token->type = OP_ALT;
1910       break;
1911     case '|':
1912       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1913         token->type = OP_ALT;
1914       break;
1915     case '*':
1916       token->type = OP_DUP_ASTERISK;
1917       break;
1918     case '+':
1919       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1920         token->type = OP_DUP_PLUS;
1921       break;
1922     case '?':
1923       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1924         token->type = OP_DUP_QUESTION;
1925       break;
1926     case '{':
1927       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1928         token->type = OP_OPEN_DUP_NUM;
1929       break;
1930     case '}':
1931       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1932         token->type = OP_CLOSE_DUP_NUM;
1933       break;
1934     case '(':
1935       if (syntax & RE_NO_BK_PARENS)
1936         token->type = OP_OPEN_SUBEXP;
1937       break;
1938     case ')':
1939       if (syntax & RE_NO_BK_PARENS)
1940         token->type = OP_CLOSE_SUBEXP;
1941       break;
1942     case '[':
1943       token->type = OP_OPEN_BRACKET;
1944       break;
1945     case '.':
1946       token->type = OP_PERIOD;
1947       break;
1948     case '^':
1949       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1950           re_string_cur_idx (input) != 0)
1951         {
1952           char prev = re_string_peek_byte (input, -1);
1953           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1954             break;
1955         }
1956       token->type = ANCHOR;
1957       token->opr.ctx_type = LINE_FIRST;
1958       break;
1959     case '$':
1960       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1961           re_string_cur_idx (input) + 1 != re_string_length (input))
1962         {
1963           re_token_t next;
1964           re_string_skip_bytes (input, 1);
1965           peek_token (&next, input, syntax);
1966           re_string_skip_bytes (input, -1);
1967           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1968             break;
1969         }
1970       token->type = ANCHOR;
1971       token->opr.ctx_type = LINE_LAST;
1972       break;
1973     default:
1974       break;
1975     }
1976   return 1;
1977 }
1978
1979 /* Peek a token from INPUT, and return the length of the token.
1980    We must not use this function out of bracket expressions.  */
1981
1982 static int
1983 internal_function
1984 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1985 {
1986   unsigned char c;
1987   if (re_string_eoi (input))
1988     {
1989       token->type = END_OF_RE;
1990       return 0;
1991     }
1992   c = re_string_peek_byte (input, 0);
1993   token->opr.c = c;
1994
1995 #ifdef RE_ENABLE_I18N
1996   if (input->mb_cur_max > 1 &&
1997       !re_string_first_byte (input, re_string_cur_idx (input)))
1998     {
1999       token->type = CHARACTER;
2000       return 1;
2001     }
2002 #endif /* RE_ENABLE_I18N */
2003
2004   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2005       && re_string_cur_idx (input) + 1 < re_string_length (input))
2006     {
2007       /* In this case, '\' escape a character.  */
2008       unsigned char c2;
2009       re_string_skip_bytes (input, 1);
2010       c2 = re_string_peek_byte (input, 0);
2011       token->opr.c = c2;
2012       token->type = CHARACTER;
2013       return 1;
2014     }
2015   if (c == '[') /* '[' is a special char in a bracket exps.  */
2016     {
2017       unsigned char c2;
2018       int token_len;
2019       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2020         c2 = re_string_peek_byte (input, 1);
2021       else
2022         c2 = 0;
2023       token->opr.c = c2;
2024       token_len = 2;
2025       switch (c2)
2026         {
2027         case '.':
2028           token->type = OP_OPEN_COLL_ELEM;
2029           break;
2030         case '=':
2031           token->type = OP_OPEN_EQUIV_CLASS;
2032           break;
2033         case ':':
2034           if (syntax & RE_CHAR_CLASSES)
2035             {
2036               token->type = OP_OPEN_CHAR_CLASS;
2037               break;
2038             }
2039           /* else fall through.  */
2040         default:
2041           token->type = CHARACTER;
2042           token->opr.c = c;
2043           token_len = 1;
2044           break;
2045         }
2046       return token_len;
2047     }
2048   switch (c)
2049     {
2050     case '-':
2051       token->type = OP_CHARSET_RANGE;
2052       break;
2053     case ']':
2054       token->type = OP_CLOSE_BRACKET;
2055       break;
2056     case '^':
2057       token->type = OP_NON_MATCH_LIST;
2058       break;
2059     default:
2060       token->type = CHARACTER;
2061     }
2062   return 1;
2063 }
2064 \f
2065 /* Functions for parser.  */
2066
2067 /* Entry point of the parser.
2068    Parse the regular expression REGEXP and return the structure tree.
2069    If an error is occured, ERR is set by error code, and return NULL.
2070    This function build the following tree, from regular expression <reg_exp>:
2071            CAT
2072            / \
2073           /   \
2074    <reg_exp>  EOR
2075
2076    CAT means concatenation.
2077    EOR means end of regular expression.  */
2078
2079 static bin_tree_t *
2080 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2081        reg_errcode_t *err)
2082 {
2083   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2084   bin_tree_t *tree, *eor, *root;
2085   re_token_t current_token;
2086   dfa->syntax = syntax;
2087   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2088   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2089   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2090     return NULL;
2091   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2092   if (tree != NULL)
2093     root = create_tree (dfa, tree, eor, CONCAT);
2094   else
2095     root = eor;
2096   if (BE (eor == NULL || root == NULL, 0))
2097     {
2098       *err = REG_ESPACE;
2099       return NULL;
2100     }
2101   return root;
2102 }
2103
2104 /* This function build the following tree, from regular expression
2105    <branch1>|<branch2>:
2106            ALT
2107            / \
2108           /   \
2109    <branch1> <branch2>
2110
2111    ALT means alternative, which represents the operator `|'.  */
2112
2113 static bin_tree_t *
2114 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2115                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2116 {
2117   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118   bin_tree_t *tree, *branch = NULL;
2119   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2120   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121     return NULL;
2122
2123   while (token->type == OP_ALT)
2124     {
2125       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126       if (token->type != OP_ALT && token->type != END_OF_RE
2127           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2128         {
2129           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2130           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2131             return NULL;
2132         }
2133       else
2134         branch = NULL;
2135       tree = create_tree (dfa, tree, branch, OP_ALT);
2136       if (BE (tree == NULL, 0))
2137         {
2138           *err = REG_ESPACE;
2139           return NULL;
2140         }
2141     }
2142   return tree;
2143 }
2144
2145 /* This function build the following tree, from regular expression
2146    <exp1><exp2>:
2147         CAT
2148         / \
2149        /   \
2150    <exp1> <exp2>
2151
2152    CAT means concatenation.  */
2153
2154 static bin_tree_t *
2155 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2156               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2157 {
2158   bin_tree_t *tree, *expr;
2159   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2160   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2161   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162     return NULL;
2163
2164   while (token->type != OP_ALT && token->type != END_OF_RE
2165          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166     {
2167       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2168       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2169         {
2170           return NULL;
2171         }
2172       if (tree != NULL && expr != NULL)
2173         {
2174           tree = create_tree (dfa, tree, expr, CONCAT);
2175           if (tree == NULL)
2176             {
2177               *err = REG_ESPACE;
2178               return NULL;
2179             }
2180         }
2181       else if (tree == NULL)
2182         tree = expr;
2183       /* Otherwise expr == NULL, we don't need to create new tree.  */
2184     }
2185   return tree;
2186 }
2187
2188 /* This function build the following tree, from regular expression a*:
2189          *
2190          |
2191          a
2192 */
2193
2194 static bin_tree_t *
2195 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2196                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2197 {
2198   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2199   bin_tree_t *tree;
2200   switch (token->type)
2201     {
2202     case CHARACTER:
2203       tree = create_token_tree (dfa, NULL, NULL, token);
2204       if (BE (tree == NULL, 0))
2205         {
2206           *err = REG_ESPACE;
2207           return NULL;
2208         }
2209 #ifdef RE_ENABLE_I18N
2210       if (dfa->mb_cur_max > 1)
2211         {
2212           while (!re_string_eoi (regexp)
2213                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2214             {
2215               bin_tree_t *mbc_remain;
2216               fetch_token (token, regexp, syntax);
2217               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2218               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2219               if (BE (mbc_remain == NULL || tree == NULL, 0))
2220                 {
2221                   *err = REG_ESPACE;
2222                   return NULL;
2223                 }
2224             }
2225         }
2226 #endif
2227       break;
2228     case OP_OPEN_SUBEXP:
2229       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2230       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2231         return NULL;
2232       break;
2233     case OP_OPEN_BRACKET:
2234       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2235       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2236         return NULL;
2237       break;
2238     case OP_BACK_REF:
2239       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2240         {
2241           *err = REG_ESUBREG;
2242           return NULL;
2243         }
2244       dfa->used_bkref_map |= 1 << token->opr.idx;
2245       tree = create_token_tree (dfa, NULL, NULL, token);
2246       if (BE (tree == NULL, 0))
2247         {
2248           *err = REG_ESPACE;
2249           return NULL;
2250         }
2251       ++dfa->nbackref;
2252       dfa->has_mb_node = 1;
2253       break;
2254     case OP_OPEN_DUP_NUM:
2255       if (syntax & RE_CONTEXT_INVALID_DUP)
2256         {
2257           *err = REG_BADRPT;
2258           return NULL;
2259         }
2260       /* FALLTHROUGH */
2261     case OP_DUP_ASTERISK:
2262     case OP_DUP_PLUS:
2263     case OP_DUP_QUESTION:
2264       if (syntax & RE_CONTEXT_INVALID_OPS)
2265         {
2266           *err = REG_BADRPT;
2267           return NULL;
2268         }
2269       else if (syntax & RE_CONTEXT_INDEP_OPS)
2270         {
2271           fetch_token (token, regexp, syntax);
2272           return parse_expression (regexp, preg, token, syntax, nest, err);
2273         }
2274       /* else fall through  */
2275     case OP_CLOSE_SUBEXP:
2276       if ((token->type == OP_CLOSE_SUBEXP) &&
2277           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2278         {
2279           *err = REG_ERPAREN;
2280           return NULL;
2281         }
2282       /* else fall through  */
2283     case OP_CLOSE_DUP_NUM:
2284       /* We treat it as a normal character.  */
2285
2286       /* Then we can these characters as normal characters.  */
2287       token->type = CHARACTER;
2288       /* mb_partial and word_char bits should be initialized already
2289          by peek_token.  */
2290       tree = create_token_tree (dfa, NULL, NULL, token);
2291       if (BE (tree == NULL, 0))
2292         {
2293           *err = REG_ESPACE;
2294           return NULL;
2295         }
2296       break;
2297     case ANCHOR:
2298       if ((token->opr.ctx_type
2299            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2300           && dfa->word_ops_used == 0)
2301         init_word_char (dfa);
2302       if (token->opr.ctx_type == WORD_DELIM
2303           || token->opr.ctx_type == NOT_WORD_DELIM)
2304         {
2305           bin_tree_t *tree_first, *tree_last;
2306           if (token->opr.ctx_type == WORD_DELIM)
2307             {
2308               token->opr.ctx_type = WORD_FIRST;
2309               tree_first = create_token_tree (dfa, NULL, NULL, token);
2310               token->opr.ctx_type = WORD_LAST;
2311             }
2312           else
2313             {
2314               token->opr.ctx_type = INSIDE_WORD;
2315               tree_first = create_token_tree (dfa, NULL, NULL, token);
2316               token->opr.ctx_type = INSIDE_NOTWORD;
2317             }
2318           tree_last = create_token_tree (dfa, NULL, NULL, token);
2319           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2320           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2321             {
2322               *err = REG_ESPACE;
2323               return NULL;
2324             }
2325         }
2326       else
2327         {
2328           tree = create_token_tree (dfa, NULL, NULL, token);
2329           if (BE (tree == NULL, 0))
2330             {
2331               *err = REG_ESPACE;
2332               return NULL;
2333             }
2334         }
2335       /* We must return here, since ANCHORs can't be followed
2336          by repetition operators.
2337          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2338              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2339       fetch_token (token, regexp, syntax);
2340       return tree;
2341     case OP_PERIOD:
2342       tree = create_token_tree (dfa, NULL, NULL, token);
2343       if (BE (tree == NULL, 0))
2344         {
2345           *err = REG_ESPACE;
2346           return NULL;
2347         }
2348       if (dfa->mb_cur_max > 1)
2349         dfa->has_mb_node = 1;
2350       break;
2351     case OP_WORD:
2352     case OP_NOTWORD:
2353       tree = build_charclass_op (dfa, regexp->trans,
2354                                  (const unsigned char *) "alnum",
2355                                  (const unsigned char *) "_",
2356                                  token->type == OP_NOTWORD, err);
2357       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2358         return NULL;
2359       break;
2360     case OP_SPACE:
2361     case OP_NOTSPACE:
2362       tree = build_charclass_op (dfa, regexp->trans,
2363                                  (const unsigned char *) "space",
2364                                  (const unsigned char *) "",
2365                                  token->type == OP_NOTSPACE, err);
2366       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2367         return NULL;
2368       break;
2369     case OP_ALT:
2370     case END_OF_RE:
2371       return NULL;
2372     case BACK_SLASH:
2373       *err = REG_EESCAPE;
2374       return NULL;
2375     default:
2376       /* Must not happen?  */
2377 #ifdef DEBUG
2378       assert (0);
2379 #endif
2380       return NULL;
2381     }
2382   fetch_token (token, regexp, syntax);
2383
2384   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2385          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2386     {
2387       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2388       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389         return NULL;
2390       /* In BRE consecutive duplications are not allowed.  */
2391       if ((syntax & RE_CONTEXT_INVALID_DUP)
2392           && (token->type == OP_DUP_ASTERISK
2393               || token->type == OP_OPEN_DUP_NUM))
2394         {
2395           *err = REG_BADRPT;
2396           return NULL;
2397         }
2398     }
2399
2400   return tree;
2401 }
2402
2403 /* This function build the following tree, from regular expression
2404    (<reg_exp>):
2405          SUBEXP
2406             |
2407         <reg_exp>
2408 */
2409
2410 static bin_tree_t *
2411 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2412                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2413 {
2414   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2415   bin_tree_t *tree;
2416   size_t cur_nsub;
2417   cur_nsub = preg->re_nsub++;
2418
2419   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2420
2421   /* The subexpression may be a null string.  */
2422   if (token->type == OP_CLOSE_SUBEXP)
2423     tree = NULL;
2424   else
2425     {
2426       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2427       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2428         *err = REG_EPAREN;
2429       if (BE (*err != REG_NOERROR, 0))
2430         return NULL;
2431     }
2432
2433   if (cur_nsub <= '9' - '1')
2434     dfa->completed_bkref_map |= 1 << cur_nsub;
2435
2436   tree = create_tree (dfa, tree, NULL, SUBEXP);
2437   if (BE (tree == NULL, 0))
2438     {
2439       *err = REG_ESPACE;
2440       return NULL;
2441     }
2442   tree->token.opr.idx = cur_nsub;
2443   return tree;
2444 }
2445
2446 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2447
2448 static bin_tree_t *
2449 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2450               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451 {
2452   bin_tree_t *tree = NULL, *old_tree = NULL;
2453   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2454   re_token_t start_token = *token;
2455
2456   if (token->type == OP_OPEN_DUP_NUM)
2457     {
2458       end = 0;
2459       start = fetch_number (regexp, token, syntax);
2460       if (start == REG_MISSING)
2461         {
2462           if (token->type == CHARACTER && token->opr.c == ',')
2463             start = 0; /* We treat "{,m}" as "{0,m}".  */
2464           else
2465             {
2466               *err = REG_BADBR; /* <re>{} is invalid.  */
2467               return NULL;
2468             }
2469         }
2470       if (BE (start != REG_ERROR, 1))
2471         {
2472           /* We treat "{n}" as "{n,n}".  */
2473           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2474                  : ((token->type == CHARACTER && token->opr.c == ',')
2475                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2476         }
2477       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2478         {
2479           /* Invalid sequence.  */
2480           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481             {
2482               if (token->type == END_OF_RE)
2483                 *err = REG_EBRACE;
2484               else
2485                 *err = REG_BADBR;
2486
2487               return NULL;
2488             }
2489
2490           /* If the syntax bit is set, rollback.  */
2491           re_string_set_index (regexp, start_idx);
2492           *token = start_token;
2493           token->type = CHARACTER;
2494           /* mb_partial and word_char bits should be already initialized by
2495              peek_token.  */
2496           return elem;
2497         }
2498
2499       if (BE (end != REG_MISSING && start > end, 0))
2500         {
2501           /* First number greater than second.  */
2502           *err = REG_BADBR;
2503           return NULL;
2504         }
2505     }
2506   else
2507     {
2508       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2509       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2510     }
2511
2512   fetch_token (token, regexp, syntax);
2513
2514   if (BE (elem == NULL, 0))
2515     return NULL;
2516   if (BE (start == 0 && end == 0, 0))
2517     {
2518       postorder (elem, free_tree, NULL);
2519       return NULL;
2520     }
2521
2522   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2523   if (BE (start > 0, 0))
2524     {
2525       tree = elem;
2526       for (i = 2; i <= start; ++i)
2527         {
2528           elem = duplicate_tree (elem, dfa);
2529           tree = create_tree (dfa, tree, elem, CONCAT);
2530           if (BE (elem == NULL || tree == NULL, 0))
2531             goto parse_dup_op_espace;
2532         }
2533
2534       if (start == end)
2535         return tree;
2536
2537       /* Duplicate ELEM before it is marked optional.  */
2538       elem = duplicate_tree (elem, dfa);
2539       old_tree = tree;
2540     }
2541   else
2542     old_tree = NULL;
2543
2544   if (elem->token.type == SUBEXP)
2545     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2546
2547   tree = create_tree (dfa, elem, NULL,
2548                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2549   if (BE (tree == NULL, 0))
2550     goto parse_dup_op_espace;
2551
2552   /* This loop is actually executed only when end != REG_MISSING,
2553      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2554      already created the start+1-th copy.  */
2555   if ((Idx) -1 < 0 || end != REG_MISSING)
2556     for (i = start + 2; i <= end; ++i)
2557       {
2558         elem = duplicate_tree (elem, dfa);
2559         tree = create_tree (dfa, tree, elem, CONCAT);
2560         if (BE (elem == NULL || tree == NULL, 0))
2561           goto parse_dup_op_espace;
2562
2563         tree = create_tree (dfa, tree, NULL, OP_ALT);
2564         if (BE (tree == NULL, 0))
2565           goto parse_dup_op_espace;
2566       }
2567
2568   if (old_tree)
2569     tree = create_tree (dfa, old_tree, tree, CONCAT);
2570
2571   return tree;
2572
2573  parse_dup_op_espace:
2574   *err = REG_ESPACE;
2575   return NULL;
2576 }
2577
2578 /* Size of the names for collating symbol/equivalence_class/character_class.
2579    I'm not sure, but maybe enough.  */
2580 #define BRACKET_NAME_BUF_SIZE 32
2581
2582 #ifndef _LIBC
2583   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2584      Build the range expression which starts from START_ELEM, and ends
2585      at END_ELEM.  The result are written to MBCSET and SBCSET.
2586      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2587      mbcset->range_ends, is a pointer argument sinse we may
2588      update it.  */
2589
2590 static reg_errcode_t
2591 internal_function
2592 # ifdef RE_ENABLE_I18N
2593 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2594                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2595 # else /* not RE_ENABLE_I18N */
2596 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2597                  bracket_elem_t *end_elem)
2598 # endif /* not RE_ENABLE_I18N */
2599 {
2600   unsigned int start_ch, end_ch;
2601   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2602   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2603           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2604           0))
2605     return REG_ERANGE;
2606
2607   /* We can handle no multi character collating elements without libc
2608      support.  */
2609   if (BE ((start_elem->type == COLL_SYM
2610            && strlen ((char *) start_elem->opr.name) > 1)
2611           || (end_elem->type == COLL_SYM
2612               && strlen ((char *) end_elem->opr.name) > 1), 0))
2613     return REG_ECOLLATE;
2614
2615 # ifdef RE_ENABLE_I18N
2616   {
2617     wchar_t wc;
2618     wint_t start_wc;
2619     wint_t end_wc;
2620     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2621
2622     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2623                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2624                    : 0));
2625     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2626               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2627                  : 0));
2628     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2629                 ? __btowc (start_ch) : start_elem->opr.wch);
2630     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2631               ? __btowc (end_ch) : end_elem->opr.wch);
2632     if (start_wc == WEOF || end_wc == WEOF)
2633       return REG_ECOLLATE;
2634     cmp_buf[0] = start_wc;
2635     cmp_buf[4] = end_wc;
2636     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2637       return REG_ERANGE;
2638
2639     /* Got valid collation sequence values, add them as a new entry.
2640        However, for !_LIBC we have no collation elements: if the
2641        character set is single byte, the single byte character set
2642        that we build below suffices.  parse_bracket_exp passes
2643        no MBCSET if dfa->mb_cur_max == 1.  */
2644     if (mbcset)
2645       {
2646         /* Check the space of the arrays.  */
2647         if (BE (*range_alloc == mbcset->nranges, 0))
2648           {
2649             /* There is not enough space, need realloc.  */
2650             wchar_t *new_array_start, *new_array_end;
2651             Idx new_nranges;
2652
2653             /* +1 in case of mbcset->nranges is 0.  */
2654             new_nranges = 2 * mbcset->nranges + 1;
2655             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2656                are NULL if *range_alloc == 0.  */
2657             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2658                                           new_nranges);
2659             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2660                                         new_nranges);
2661
2662             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2663               return REG_ESPACE;
2664
2665             mbcset->range_starts = new_array_start;
2666             mbcset->range_ends = new_array_end;
2667             *range_alloc = new_nranges;
2668           }
2669
2670         mbcset->range_starts[mbcset->nranges] = start_wc;
2671         mbcset->range_ends[mbcset->nranges++] = end_wc;
2672       }
2673
2674     /* Build the table for single byte characters.  */
2675     for (wc = 0; wc < SBC_MAX; ++wc)
2676       {
2677         cmp_buf[2] = wc;
2678         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2679             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2680           bitset_set (sbcset, wc);
2681       }
2682   }
2683 # else /* not RE_ENABLE_I18N */
2684   {
2685     unsigned int ch;
2686     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2687                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2688                    : 0));
2689     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2690               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2691                  : 0));
2692     if (start_ch > end_ch)
2693       return REG_ERANGE;
2694     /* Build the table for single byte characters.  */
2695     for (ch = 0; ch < SBC_MAX; ++ch)
2696       if (start_ch <= ch  && ch <= end_ch)
2697         bitset_set (sbcset, ch);
2698   }
2699 # endif /* not RE_ENABLE_I18N */
2700   return REG_NOERROR;
2701 }
2702 #endif /* not _LIBC */
2703
2704 #ifndef _LIBC
2705 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2706    Build the collating element which is represented by NAME.
2707    The result are written to MBCSET and SBCSET.
2708    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2709    pointer argument since we may update it.  */
2710
2711 static reg_errcode_t
2712 internal_function
2713 build_collating_symbol (bitset_t sbcset,
2714 # ifdef RE_ENABLE_I18N
2715                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2716 # endif
2717                         const unsigned char *name)
2718 {
2719   size_t name_len = strlen ((const char *) name);
2720   if (BE (name_len != 1, 0))
2721     return REG_ECOLLATE;
2722   else
2723     {
2724       bitset_set (sbcset, name[0]);
2725       return REG_NOERROR;
2726     }
2727 }
2728 #endif /* not _LIBC */
2729
2730 /* This function parse bracket expression like "[abc]", "[a-c]",
2731    "[[.a-a.]]" etc.  */
2732
2733 static bin_tree_t *
2734 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2735                    reg_syntax_t syntax, reg_errcode_t *err)
2736 {
2737 #ifdef _LIBC
2738   const unsigned char *collseqmb;
2739   const char *collseqwc;
2740   uint32_t nrules;
2741   int32_t table_size;
2742   const int32_t *symb_table;
2743   const unsigned char *extra;
2744
2745   /* Local function for parse_bracket_exp used in _LIBC environement.
2746      Seek the collating symbol entry correspondings to NAME.
2747      Return the index of the symbol in the SYMB_TABLE.  */
2748
2749   auto inline int32_t
2750   __attribute ((always_inline))
2751   seek_collating_symbol_entry (name, name_len)
2752          const unsigned char *name;
2753          size_t name_len;
2754     {
2755       int32_t hash = elem_hash ((const char *) name, name_len);
2756       int32_t elem = hash % table_size;
2757       if (symb_table[2 * elem] != 0)
2758         {
2759           int32_t second = hash % (table_size - 2) + 1;
2760
2761           do
2762             {
2763               /* First compare the hashing value.  */
2764               if (symb_table[2 * elem] == hash
2765                   /* Compare the length of the name.  */
2766                   && name_len == extra[symb_table[2 * elem + 1]]
2767                   /* Compare the name.  */
2768                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2769                              name_len) == 0)
2770                 {
2771                   /* Yep, this is the entry.  */
2772                   break;
2773                 }
2774
2775               /* Next entry.  */
2776               elem += second;
2777             }
2778           while (symb_table[2 * elem] != 0);
2779         }
2780       return elem;
2781     }
2782
2783   /* Local function for parse_bracket_exp used in _LIBC environement.
2784      Look up the collation sequence value of BR_ELEM.
2785      Return the value if succeeded, UINT_MAX otherwise.  */
2786
2787   auto inline unsigned int
2788   __attribute ((always_inline))
2789   lookup_collation_sequence_value (br_elem)
2790          bracket_elem_t *br_elem;
2791     {
2792       if (br_elem->type == SB_CHAR)
2793         {
2794           /*
2795           if (MB_CUR_MAX == 1)
2796           */
2797           if (nrules == 0)
2798             return collseqmb[br_elem->opr.ch];
2799           else
2800             {
2801               wint_t wc = __btowc (br_elem->opr.ch);
2802               return __collseq_table_lookup (collseqwc, wc);
2803             }
2804         }
2805       else if (br_elem->type == MB_CHAR)
2806         {
2807           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808         }
2809       else if (br_elem->type == COLL_SYM)
2810         {
2811           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2812           if (nrules != 0)
2813             {
2814               int32_t elem, idx;
2815               elem = seek_collating_symbol_entry (br_elem->opr.name,
2816                                                   sym_name_len);
2817               if (symb_table[2 * elem] != 0)
2818                 {
2819                   /* We found the entry.  */
2820                   idx = symb_table[2 * elem + 1];
2821                   /* Skip the name of collating element name.  */
2822                   idx += 1 + extra[idx];
2823                   /* Skip the byte sequence of the collating element.  */
2824                   idx += 1 + extra[idx];
2825                   /* Adjust for the alignment.  */
2826                   idx = (idx + 3) & ~3;
2827                   /* Skip the multibyte collation sequence value.  */
2828                   idx += sizeof (unsigned int);
2829                   /* Skip the wide char sequence of the collating element.  */
2830                   idx += sizeof (unsigned int) *
2831                     (1 + *(unsigned int *) (extra + idx));
2832                   /* Return the collation sequence value.  */
2833                   return *(unsigned int *) (extra + idx);
2834                 }
2835               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836                 {
2837                   /* No valid character.  Match it as a single byte
2838                      character.  */
2839                   return collseqmb[br_elem->opr.name[0]];
2840                 }
2841             }
2842           else if (sym_name_len == 1)
2843             return collseqmb[br_elem->opr.name[0]];
2844         }
2845       return UINT_MAX;
2846     }
2847
2848   /* Local function for parse_bracket_exp used in _LIBC environement.
2849      Build the range expression which starts from START_ELEM, and ends
2850      at END_ELEM.  The result are written to MBCSET and SBCSET.
2851      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2852      mbcset->range_ends, is a pointer argument sinse we may
2853      update it.  */
2854
2855   auto inline reg_errcode_t
2856   __attribute ((always_inline))
2857   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2858          re_charset_t *mbcset;
2859          Idx *range_alloc;
2860          bitset_t sbcset;
2861          bracket_elem_t *start_elem, *end_elem;
2862     {
2863       unsigned int ch;
2864       uint32_t start_collseq;
2865       uint32_t end_collseq;
2866
2867       /* Equivalence Classes and Character Classes can't be a range
2868          start/end.  */
2869       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2870               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2871               0))
2872         return REG_ERANGE;
2873
2874       start_collseq = lookup_collation_sequence_value (start_elem);
2875       end_collseq = lookup_collation_sequence_value (end_elem);
2876       /* Check start/end collation sequence values.  */
2877       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2878         return REG_ECOLLATE;
2879       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880         return REG_ERANGE;
2881
2882       /* Got valid collation sequence values, add them as a new entry.
2883          However, if we have no collation elements, and the character set
2884          is single byte, the single byte character set that we
2885          build below suffices. */
2886       if (nrules > 0 || dfa->mb_cur_max > 1)
2887         {
2888           /* Check the space of the arrays.  */
2889           if (BE (*range_alloc == mbcset->nranges, 0))
2890             {
2891               /* There is not enough space, need realloc.  */
2892               uint32_t *new_array_start;
2893               uint32_t *new_array_end;
2894               Idx new_nranges;
2895
2896               /* +1 in case of mbcset->nranges is 0.  */
2897               new_nranges = 2 * mbcset->nranges + 1;
2898               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899                                             new_nranges);
2900               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901                                           new_nranges);
2902
2903               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904                 return REG_ESPACE;
2905
2906               mbcset->range_starts = new_array_start;
2907               mbcset->range_ends = new_array_end;
2908               *range_alloc = new_nranges;
2909             }
2910
2911           mbcset->range_starts[mbcset->nranges] = start_collseq;
2912           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913         }
2914
2915       /* Build the table for single byte characters.  */
2916       for (ch = 0; ch < SBC_MAX; ch++)
2917         {
2918           uint32_t ch_collseq;
2919           /*
2920           if (MB_CUR_MAX == 1)
2921           */
2922           if (nrules == 0)
2923             ch_collseq = collseqmb[ch];
2924           else
2925             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2926           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2927             bitset_set (sbcset, ch);
2928         }
2929       return REG_NOERROR;
2930     }
2931
2932   /* Local function for parse_bracket_exp used in _LIBC environement.
2933      Build the collating element which is represented by NAME.
2934      The result are written to MBCSET and SBCSET.
2935      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2936      pointer argument sinse we may update it.  */
2937
2938   auto inline reg_errcode_t
2939   __attribute ((always_inline))
2940   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2941          re_charset_t *mbcset;
2942          Idx *coll_sym_alloc;
2943          bitset_t sbcset;
2944          const unsigned char *name;
2945     {
2946       int32_t elem, idx;
2947       size_t name_len = strlen ((const char *) name);
2948       if (nrules != 0)
2949         {
2950           elem = seek_collating_symbol_entry (name, name_len);
2951           if (symb_table[2 * elem] != 0)
2952             {
2953               /* We found the entry.  */
2954               idx = symb_table[2 * elem + 1];
2955               /* Skip the name of collating element name.  */
2956               idx += 1 + extra[idx];
2957             }
2958           else if (symb_table[2 * elem] == 0 && name_len == 1)
2959             {
2960               /* No valid character, treat it as a normal
2961                  character.  */
2962               bitset_set (sbcset, name[0]);
2963               return REG_NOERROR;
2964             }
2965           else
2966             return REG_ECOLLATE;
2967
2968           /* Got valid collation sequence, add it as a new entry.  */
2969           /* Check the space of the arrays.  */
2970           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971             {
2972               /* Not enough, realloc it.  */
2973               /* +1 in case of mbcset->ncoll_syms is 0.  */
2974               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2975               /* Use realloc since mbcset->coll_syms is NULL
2976                  if *alloc == 0.  */
2977               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2978                                                    new_coll_sym_alloc);
2979               if (BE (new_coll_syms == NULL, 0))
2980                 return REG_ESPACE;
2981               mbcset->coll_syms = new_coll_syms;
2982               *coll_sym_alloc = new_coll_sym_alloc;
2983             }
2984           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2985           return REG_NOERROR;
2986         }
2987       else
2988         {
2989           if (BE (name_len != 1, 0))
2990             return REG_ECOLLATE;
2991           else
2992             {
2993               bitset_set (sbcset, name[0]);
2994               return REG_NOERROR;
2995             }
2996         }
2997     }
2998 #endif
2999
3000   re_token_t br_token;
3001   re_bitset_ptr_t sbcset;
3002 #ifdef RE_ENABLE_I18N
3003   re_charset_t *mbcset;
3004   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3005   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3006 #endif /* not RE_ENABLE_I18N */
3007   bool non_match = false;
3008   bin_tree_t *work_tree;
3009   int token_len;
3010   bool first_round = true;
3011 #ifdef _LIBC
3012   collseqmb = (const unsigned char *)
3013     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3014   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3015   if (nrules)
3016     {
3017       /*
3018       if (MB_CUR_MAX > 1)
3019       */
3020       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3021       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3022       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3023                                                   _NL_COLLATE_SYMB_TABLEMB);
3024       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3025                                                    _NL_COLLATE_SYMB_EXTRAMB);
3026     }
3027 #endif
3028   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3029 #ifdef RE_ENABLE_I18N
3030   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3031 #endif /* RE_ENABLE_I18N */
3032 #ifdef RE_ENABLE_I18N
3033   if (BE (sbcset == NULL || mbcset == NULL, 0))
3034 #else
3035   if (BE (sbcset == NULL, 0))
3036 #endif /* RE_ENABLE_I18N */
3037     {
3038       *err = REG_ESPACE;
3039       return NULL;
3040     }
3041
3042   token_len = peek_token_bracket (token, regexp, syntax);
3043   if (BE (token->type == END_OF_RE, 0))
3044     {
3045       *err = REG_BADPAT;
3046       goto parse_bracket_exp_free_return;
3047     }
3048   if (token->type == OP_NON_MATCH_LIST)
3049     {
3050 #ifdef RE_ENABLE_I18N
3051       mbcset->non_match = 1;
3052 #endif /* not RE_ENABLE_I18N */
3053       non_match = true;
3054       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3055         bitset_set (sbcset, '\n');
3056       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3057       token_len = peek_token_bracket (token, regexp, syntax);
3058       if (BE (token->type == END_OF_RE, 0))
3059         {
3060           *err = REG_BADPAT;
3061           goto parse_bracket_exp_free_return;
3062         }
3063     }
3064
3065   /* We treat the first ']' as a normal character.  */
3066   if (token->type == OP_CLOSE_BRACKET)
3067     token->type = CHARACTER;
3068
3069   while (1)
3070     {
3071       bracket_elem_t start_elem, end_elem;
3072       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3073       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3074       reg_errcode_t ret;
3075       int token_len2 = 0;
3076       bool is_range_exp = false;
3077       re_token_t token2;
3078
3079       start_elem.opr.name = start_name_buf;
3080       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3081                                    syntax, first_round);
3082       if (BE (ret != REG_NOERROR, 0))
3083         {
3084           *err = ret;
3085           goto parse_bracket_exp_free_return;
3086         }
3087       first_round = false;
3088
3089       /* Get information about the next token.  We need it in any case.  */
3090       token_len = peek_token_bracket (token, regexp, syntax);
3091
3092       /* Do not check for ranges if we know they are not allowed.  */
3093       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3094         {
3095           if (BE (token->type == END_OF_RE, 0))
3096             {
3097               *err = REG_EBRACK;
3098               goto parse_bracket_exp_free_return;
3099             }
3100           if (token->type == OP_CHARSET_RANGE)
3101             {
3102               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3103               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3104               if (BE (token2.type == END_OF_RE, 0))
3105                 {
3106                   *err = REG_EBRACK;
3107                   goto parse_bracket_exp_free_return;
3108                 }
3109               if (token2.type == OP_CLOSE_BRACKET)
3110                 {
3111                   /* We treat the last '-' as a normal character.  */
3112                   re_string_skip_bytes (regexp, -token_len);
3113                   token->type = CHARACTER;
3114                 }
3115               else
3116                 is_range_exp = true;
3117             }
3118         }
3119
3120       if (is_range_exp == true)
3121         {
3122           end_elem.opr.name = end_name_buf;
3123           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3124                                        dfa, syntax, true);
3125           if (BE (ret != REG_NOERROR, 0))
3126             {
3127               *err = ret;
3128               goto parse_bracket_exp_free_return;
3129             }
3130
3131           token_len = peek_token_bracket (token, regexp, syntax);
3132
3133 #ifdef _LIBC
3134           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3135                                   &start_elem, &end_elem);
3136 #else
3137 # ifdef RE_ENABLE_I18N
3138           *err = build_range_exp (sbcset,
3139                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3140                                   &range_alloc, &start_elem, &end_elem);
3141 # else
3142           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3143 # endif
3144 #endif /* RE_ENABLE_I18N */
3145           if (BE (*err != REG_NOERROR, 0))
3146             goto parse_bracket_exp_free_return;
3147         }
3148       else
3149         {
3150           switch (start_elem.type)
3151             {
3152             case SB_CHAR:
3153               bitset_set (sbcset, start_elem.opr.ch);
3154               break;
3155 #ifdef RE_ENABLE_I18N
3156             case MB_CHAR:
3157               /* Check whether the array has enough space.  */
3158               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3159                 {
3160                   wchar_t *new_mbchars;
3161                   /* Not enough, realloc it.  */
3162                   /* +1 in case of mbcset->nmbchars is 0.  */
3163                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3164                   /* Use realloc since array is NULL if *alloc == 0.  */
3165                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3166                                             mbchar_alloc);
3167                   if (BE (new_mbchars == NULL, 0))
3168                     goto parse_bracket_exp_espace;
3169                   mbcset->mbchars = new_mbchars;
3170                 }
3171               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3172               break;
3173 #endif /* RE_ENABLE_I18N */
3174             case EQUIV_CLASS:
3175               *err = build_equiv_class (sbcset,
3176 #ifdef RE_ENABLE_I18N
3177                                         mbcset, &equiv_class_alloc,
3178 #endif /* RE_ENABLE_I18N */
3179                                         start_elem.opr.name);
3180               if (BE (*err != REG_NOERROR, 0))
3181                 goto parse_bracket_exp_free_return;
3182               break;
3183             case COLL_SYM:
3184               *err = build_collating_symbol (sbcset,
3185 #ifdef RE_ENABLE_I18N
3186                                              mbcset, &coll_sym_alloc,
3187 #endif /* RE_ENABLE_I18N */
3188                                              start_elem.opr.name);
3189               if (BE (*err != REG_NOERROR, 0))
3190                 goto parse_bracket_exp_free_return;
3191               break;
3192             case CHAR_CLASS:
3193               *err = build_charclass (regexp->trans, sbcset,
3194 #ifdef RE_ENABLE_I18N
3195                                       mbcset, &char_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197                                       start_elem.opr.name, syntax);
3198               if (BE (*err != REG_NOERROR, 0))
3199                goto parse_bracket_exp_free_return;
3200               break;
3201             default:
3202               assert (0);
3203               break;
3204             }
3205         }
3206       if (BE (token->type == END_OF_RE, 0))
3207         {
3208           *err = REG_EBRACK;
3209           goto parse_bracket_exp_free_return;
3210         }
3211       if (token->type == OP_CLOSE_BRACKET)
3212         break;
3213     }
3214
3215   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3216
3217   /* If it is non-matching list.  */
3218   if (non_match)
3219     bitset_not (sbcset);
3220
3221 #ifdef RE_ENABLE_I18N
3222   /* Ensure only single byte characters are set.  */
3223   if (dfa->mb_cur_max > 1)
3224     bitset_mask (sbcset, dfa->sb_char);
3225
3226   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3227       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3228                                                      || mbcset->non_match)))
3229     {
3230       bin_tree_t *mbc_tree;
3231       int sbc_idx;
3232       /* Build a tree for complex bracket.  */
3233       dfa->has_mb_node = 1;
3234       br_token.type = COMPLEX_BRACKET;
3235       br_token.opr.mbcset = mbcset;
3236       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237       if (BE (mbc_tree == NULL, 0))
3238         goto parse_bracket_exp_espace;
3239       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3240         if (sbcset[sbc_idx])
3241           break;
3242       /* If there are no bits set in sbcset, there is no point
3243          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3244       if (sbc_idx < BITSET_WORDS)
3245         {
3246           /* Build a tree for simple bracket.  */
3247           br_token.type = SIMPLE_BRACKET;
3248           br_token.opr.sbcset = sbcset;
3249           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3250           if (BE (work_tree == NULL, 0))
3251             goto parse_bracket_exp_espace;
3252
3253           /* Then join them by ALT node.  */
3254           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3255           if (BE (work_tree == NULL, 0))
3256             goto parse_bracket_exp_espace;
3257         }
3258       else
3259         {
3260           re_free (sbcset);
3261           work_tree = mbc_tree;
3262         }
3263     }
3264   else
3265 #endif /* not RE_ENABLE_I18N */
3266     {
3267 #ifdef RE_ENABLE_I18N
3268       free_charset (mbcset);
3269 #endif
3270       /* Build a tree for simple bracket.  */
3271       br_token.type = SIMPLE_BRACKET;
3272       br_token.opr.sbcset = sbcset;
3273       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274       if (BE (work_tree == NULL, 0))
3275         goto parse_bracket_exp_espace;
3276     }
3277   return work_tree;
3278
3279  parse_bracket_exp_espace:
3280   *err = REG_ESPACE;
3281  parse_bracket_exp_free_return:
3282   re_free (sbcset);
3283 #ifdef RE_ENABLE_I18N
3284   free_charset (mbcset);
3285 #endif /* RE_ENABLE_I18N */
3286   return NULL;
3287 }
3288
3289 /* Parse an element in the bracket expression.  */
3290
3291 static reg_errcode_t
3292 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3293                        re_token_t *token, int token_len, re_dfa_t *dfa,
3294                        reg_syntax_t syntax, bool accept_hyphen)
3295 {
3296 #ifdef RE_ENABLE_I18N
3297   int cur_char_size;
3298   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3299   if (cur_char_size > 1)
3300     {
3301       elem->type = MB_CHAR;
3302       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3303       re_string_skip_bytes (regexp, cur_char_size);
3304       return REG_NOERROR;
3305     }
3306 #endif /* RE_ENABLE_I18N */
3307   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3308   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3309       || token->type == OP_OPEN_EQUIV_CLASS)
3310     return parse_bracket_symbol (elem, regexp, token);
3311   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3312     {
3313       /* A '-' must only appear as anything but a range indicator before
3314          the closing bracket.  Everything else is an error.  */
3315       re_token_t token2;
3316       (void) peek_token_bracket (&token2, regexp, syntax);
3317       if (token2.type != OP_CLOSE_BRACKET)
3318         /* The actual error value is not standardized since this whole
3319            case is undefined.  But ERANGE makes good sense.  */
3320         return REG_ERANGE;
3321     }
3322   elem->type = SB_CHAR;
3323   elem->opr.ch = token->opr.c;
3324   return REG_NOERROR;
3325 }
3326
3327 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3328    such as [:<character_class>:], [.<collating_element>.], and
3329    [=<equivalent_class>=].  */
3330
3331 static reg_errcode_t
3332 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3333                       re_token_t *token)
3334 {
3335   unsigned char ch, delim = token->opr.c;
3336   int i = 0;
3337   if (re_string_eoi(regexp))
3338     return REG_EBRACK;
3339   for (;; ++i)
3340     {
3341       if (i >= BRACKET_NAME_BUF_SIZE)
3342         return REG_EBRACK;
3343       if (token->type == OP_OPEN_CHAR_CLASS)
3344         ch = re_string_fetch_byte_case (regexp);
3345       else
3346         ch = re_string_fetch_byte (regexp);
3347       if (re_string_eoi(regexp))
3348         return REG_EBRACK;
3349       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3350         break;
3351       elem->opr.name[i] = ch;
3352     }
3353   re_string_skip_bytes (regexp, 1);
3354   elem->opr.name[i] = '\0';
3355   switch (token->type)
3356     {
3357     case OP_OPEN_COLL_ELEM:
3358       elem->type = COLL_SYM;
3359       break;
3360     case OP_OPEN_EQUIV_CLASS:
3361       elem->type = EQUIV_CLASS;
3362       break;
3363     case OP_OPEN_CHAR_CLASS:
3364       elem->type = CHAR_CLASS;
3365       break;
3366     default:
3367       break;
3368     }
3369   return REG_NOERROR;
3370 }
3371
3372   /* Helper function for parse_bracket_exp.
3373      Build the equivalence class which is represented by NAME.
3374      The result are written to MBCSET and SBCSET.
3375      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3376      is a pointer argument sinse we may update it.  */
3377
3378 static reg_errcode_t
3379 #ifdef RE_ENABLE_I18N
3380 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3381                    Idx *equiv_class_alloc, const unsigned char *name)
3382 #else /* not RE_ENABLE_I18N */
3383 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3384 #endif /* not RE_ENABLE_I18N */
3385 {
3386 #ifdef _LIBC
3387   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388   if (nrules != 0)
3389     {
3390       const int32_t *table, *indirect;
3391       const unsigned char *weights, *extra, *cp;
3392       unsigned char char_buf[2];
3393       int32_t idx1, idx2;
3394       unsigned int ch;
3395       size_t len;
3396       /* This #include defines a local function!  */
3397 # include <locale/weight.h>
3398       /* Calculate the index for equivalence class.  */
3399       cp = name;
3400       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3401       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402                                                _NL_COLLATE_WEIGHTMB);
3403       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3404                                                    _NL_COLLATE_EXTRAMB);
3405       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3406                                                 _NL_COLLATE_INDIRECTMB);
3407       idx1 = findidx (&cp);
3408       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3409         /* This isn't a valid character.  */
3410         return REG_ECOLLATE;
3411
3412       /* Build single byte matcing table for this equivalence class.  */
3413       char_buf[1] = (unsigned char) '\0';
3414       len = weights[idx1];
3415       for (ch = 0; ch < SBC_MAX; ++ch)
3416         {
3417           char_buf[0] = ch;
3418           cp = char_buf;
3419           idx2 = findidx (&cp);
3420 /*
3421           idx2 = table[ch];
3422 */
3423           if (idx2 == 0)
3424             /* This isn't a valid character.  */
3425             continue;
3426           if (len == weights[idx2])
3427             {
3428               int cnt = 0;
3429               while (cnt <= len &&
3430                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3431                 ++cnt;
3432
3433               if (cnt > len)
3434                 bitset_set (sbcset, ch);
3435             }
3436         }
3437       /* Check whether the array has enough space.  */
3438       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3439         {
3440           /* Not enough, realloc it.  */
3441           /* +1 in case of mbcset->nequiv_classes is 0.  */
3442           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3443           /* Use realloc since the array is NULL if *alloc == 0.  */
3444           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3445                                                    int32_t,
3446                                                    new_equiv_class_alloc);
3447           if (BE (new_equiv_classes == NULL, 0))
3448             return REG_ESPACE;
3449           mbcset->equiv_classes = new_equiv_classes;
3450           *equiv_class_alloc = new_equiv_class_alloc;
3451         }
3452       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3453     }
3454   else
3455 #endif /* _LIBC */
3456     {
3457       if (BE (strlen ((const char *) name) != 1, 0))
3458         return REG_ECOLLATE;
3459       bitset_set (sbcset, *name);
3460     }
3461   return REG_NOERROR;
3462 }
3463
3464   /* Helper function for parse_bracket_exp.
3465      Build the character class which is represented by NAME.
3466      The result are written to MBCSET and SBCSET.
3467      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3468      is a pointer argument sinse we may update it.  */
3469
3470 static reg_errcode_t
3471 #ifdef RE_ENABLE_I18N
3472 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3473                  re_charset_t *mbcset, Idx *char_class_alloc,
3474                  const unsigned char *class_name, reg_syntax_t syntax)
3475 #else /* not RE_ENABLE_I18N */
3476 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3477                  const unsigned char *class_name, reg_syntax_t syntax)
3478 #endif /* not RE_ENABLE_I18N */
3479 {
3480   int i;
3481   const char *name = (const char *) class_name;
3482
3483   /* In case of REG_ICASE "upper" and "lower" match the both of
3484      upper and lower cases.  */
3485   if ((syntax & RE_ICASE)
3486       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487     name = "alpha";
3488
3489 #ifdef RE_ENABLE_I18N
3490   /* Check the space of the arrays.  */
3491   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3492     {
3493       /* Not enough, realloc it.  */
3494       /* +1 in case of mbcset->nchar_classes is 0.  */
3495       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3496       /* Use realloc since array is NULL if *alloc == 0.  */
3497       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498                                                new_char_class_alloc);
3499       if (BE (new_char_classes == NULL, 0))
3500         return REG_ESPACE;
3501       mbcset->char_classes = new_char_classes;
3502       *char_class_alloc = new_char_class_alloc;
3503     }
3504   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3505 #endif /* RE_ENABLE_I18N */
3506
3507 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3508   do {                                          \
3509     if (BE (trans != NULL, 0))                  \
3510       {                                         \
3511         for (i = 0; i < SBC_MAX; ++i)           \
3512           if (ctype_func (i))                   \
3513             bitset_set (sbcset, trans[i]);      \
3514       }                                         \
3515     else                                        \
3516       {                                         \
3517         for (i = 0; i < SBC_MAX; ++i)           \
3518           if (ctype_func (i))                   \
3519             bitset_set (sbcset, i);             \
3520       }                                         \
3521   } while (0)
3522
3523   if (strcmp (name, "alnum") == 0)
3524     BUILD_CHARCLASS_LOOP (isalnum);
3525   else if (strcmp (name, "cntrl") == 0)
3526     BUILD_CHARCLASS_LOOP (iscntrl);
3527   else if (strcmp (name, "lower") == 0)
3528     BUILD_CHARCLASS_LOOP (islower);
3529   else if (strcmp (name, "space") == 0)
3530     BUILD_CHARCLASS_LOOP (isspace);
3531   else if (strcmp (name, "alpha") == 0)
3532     BUILD_CHARCLASS_LOOP (isalpha);
3533   else if (strcmp (name, "digit") == 0)
3534     BUILD_CHARCLASS_LOOP (isdigit);
3535   else if (strcmp (name, "print") == 0)
3536     BUILD_CHARCLASS_LOOP (isprint);
3537   else if (strcmp (name, "upper") == 0)
3538     BUILD_CHARCLASS_LOOP (isupper);
3539   else if (strcmp (name, "blank") == 0)
3540     BUILD_CHARCLASS_LOOP (isblank);
3541   else if (strcmp (name, "graph") == 0)
3542     BUILD_CHARCLASS_LOOP (isgraph);
3543   else if (strcmp (name, "punct") == 0)
3544     BUILD_CHARCLASS_LOOP (ispunct);
3545   else if (strcmp (name, "xdigit") == 0)
3546     BUILD_CHARCLASS_LOOP (isxdigit);
3547   else
3548     return REG_ECTYPE;
3549
3550   return REG_NOERROR;
3551 }
3552
3553 static bin_tree_t *
3554 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3555                     const unsigned char *class_name,
3556                     const unsigned char *extra, bool non_match,
3557                     reg_errcode_t *err)
3558 {
3559   re_bitset_ptr_t sbcset;
3560 #ifdef RE_ENABLE_I18N
3561   re_charset_t *mbcset;
3562   Idx alloc = 0;
3563 #endif /* not RE_ENABLE_I18N */
3564   reg_errcode_t ret;
3565   re_token_t br_token;
3566   bin_tree_t *tree;
3567
3568   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3569 #ifdef RE_ENABLE_I18N
3570   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3571 #endif /* RE_ENABLE_I18N */
3572
3573 #ifdef RE_ENABLE_I18N
3574   if (BE (sbcset == NULL || mbcset == NULL, 0))
3575 #else /* not RE_ENABLE_I18N */
3576   if (BE (sbcset == NULL, 0))
3577 #endif /* not RE_ENABLE_I18N */
3578     {
3579       *err = REG_ESPACE;
3580       return NULL;
3581     }
3582
3583   if (non_match)
3584     {
3585 #ifdef RE_ENABLE_I18N
3586       mbcset->non_match = 1;
3587 #endif /* not RE_ENABLE_I18N */
3588     }
3589
3590   /* We don't care the syntax in this case.  */
3591   ret = build_charclass (trans, sbcset,
3592 #ifdef RE_ENABLE_I18N
3593                          mbcset, &alloc,
3594 #endif /* RE_ENABLE_I18N */
3595                          class_name, 0);
3596
3597   if (BE (ret != REG_NOERROR, 0))
3598     {
3599       re_free (sbcset);
3600 #ifdef RE_ENABLE_I18N
3601       free_charset (mbcset);
3602 #endif /* RE_ENABLE_I18N */
3603       *err = ret;
3604       return NULL;
3605     }
3606   /* \w match '_' also.  */
3607   for (; *extra; extra++)
3608     bitset_set (sbcset, *extra);
3609
3610   /* If it is non-matching list.  */
3611   if (non_match)
3612     bitset_not (sbcset);
3613
3614 #ifdef RE_ENABLE_I18N
3615   /* Ensure only single byte characters are set.  */
3616   if (dfa->mb_cur_max > 1)
3617     bitset_mask (sbcset, dfa->sb_char);
3618 #endif
3619
3620   /* Build a tree for simple bracket.  */
3621   br_token.type = SIMPLE_BRACKET;
3622   br_token.opr.sbcset = sbcset;
3623   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624   if (BE (tree == NULL, 0))
3625     goto build_word_op_espace;
3626
3627 #ifdef RE_ENABLE_I18N
3628   if (dfa->mb_cur_max > 1)
3629     {
3630       bin_tree_t *mbc_tree;
3631       /* Build a tree for complex bracket.  */
3632       br_token.type = COMPLEX_BRACKET;
3633       br_token.opr.mbcset = mbcset;
3634       dfa->has_mb_node = 1;
3635       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3636       if (BE (mbc_tree == NULL, 0))
3637         goto build_word_op_espace;
3638       /* Then join them by ALT node.  */
3639       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3640       if (BE (mbc_tree != NULL, 1))
3641         return tree;
3642     }
3643   else
3644     {
3645       free_charset (mbcset);
3646       return tree;
3647     }
3648 #else /* not RE_ENABLE_I18N */
3649   return tree;
3650 #endif /* not RE_ENABLE_I18N */
3651
3652  build_word_op_espace:
3653   re_free (sbcset);
3654 #ifdef RE_ENABLE_I18N
3655   free_charset (mbcset);
3656 #endif /* RE_ENABLE_I18N */
3657   *err = REG_ESPACE;
3658   return NULL;
3659 }
3660
3661 /* This is intended for the expressions like "a{1,3}".
3662    Fetch a number from `input', and return the number.
3663    Return REG_MISSING if the number field is empty like "{,1}".
3664    Return REG_ERROR if an error occurred.  */
3665
3666 static Idx
3667 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3668 {
3669   Idx num = REG_MISSING;
3670   unsigned char c;
3671   while (1)
3672     {
3673       fetch_token (token, input, syntax);
3674       c = token->opr.c;
3675       if (BE (token->type == END_OF_RE, 0))
3676         return REG_ERROR;
3677       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3678         break;
3679       num = ((token->type != CHARACTER || c < '0' || '9' < c
3680               || num == REG_ERROR)
3681              ? REG_ERROR
3682              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3683       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3684     }
3685   return num;
3686 }
3687 \f
3688 #ifdef RE_ENABLE_I18N
3689 static void
3690 free_charset (re_charset_t *cset)
3691 {
3692   re_free (cset->mbchars);
3693 # ifdef _LIBC
3694   re_free (cset->coll_syms);
3695   re_free (cset->equiv_classes);
3696   re_free (cset->range_starts);
3697   re_free (cset->range_ends);
3698 # endif
3699   re_free (cset->char_classes);
3700   re_free (cset);
3701 }
3702 #endif /* RE_ENABLE_I18N */
3703 \f
3704 /* Functions for binary tree operation.  */
3705
3706 /* Create a tree node.  */
3707
3708 static bin_tree_t *
3709 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3710              re_token_type_t type)
3711 {
3712   re_token_t t;
3713   t.type = type;
3714   return create_token_tree (dfa, left, right, &t);
3715 }
3716
3717 static bin_tree_t *
3718 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3719                    const re_token_t *token)
3720 {
3721   bin_tree_t *tree;
3722   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3723     {
3724       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3725
3726       if (storage == NULL)
3727         return NULL;
3728       storage->next = dfa->str_tree_storage;
3729       dfa->str_tree_storage = storage;
3730       dfa->str_tree_storage_idx = 0;
3731     }
3732   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3733
3734   tree->parent = NULL;
3735   tree->left = left;
3736   tree->right = right;
3737   tree->token = *token;
3738   tree->token.duplicated = 0;
3739   tree->token.opt_subexp = 0;
3740   tree->first = NULL;
3741   tree->next = NULL;
3742   tree->node_idx = REG_MISSING;
3743
3744   if (left != NULL)
3745     left->parent = tree;
3746   if (right != NULL)
3747     right->parent = tree;
3748   return tree;
3749 }
3750
3751 /* Mark the tree SRC as an optional subexpression.
3752    To be called from preorder or postorder.  */
3753
3754 static reg_errcode_t
3755 mark_opt_subexp (void *extra, bin_tree_t *node)
3756 {
3757   Idx idx = (Idx) (long) extra;
3758   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3759     node->token.opt_subexp = 1;
3760
3761   return REG_NOERROR;
3762 }
3763
3764 /* Free the allocated memory inside NODE. */
3765
3766 static void
3767 free_token (re_token_t *node)
3768 {
3769 #ifdef RE_ENABLE_I18N
3770   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3771     free_charset (node->opr.mbcset);
3772   else
3773 #endif /* RE_ENABLE_I18N */
3774     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3775       re_free (node->opr.sbcset);
3776 }
3777
3778 /* Worker function for tree walking.  Free the allocated memory inside NODE
3779    and its children. */
3780
3781 static reg_errcode_t
3782 free_tree (void *extra, bin_tree_t *node)
3783 {
3784   free_token (&node->token);
3785   return REG_NOERROR;
3786 }
3787
3788
3789 /* Duplicate the node SRC, and return new node.  This is a preorder
3790    visit similar to the one implemented by the generic visitor, but
3791    we need more infrastructure to maintain two parallel trees --- so,
3792    it's easier to duplicate.  */
3793
3794 static bin_tree_t *
3795 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3796 {
3797   const bin_tree_t *node;
3798   bin_tree_t *dup_root;
3799   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3800
3801   for (node = root; ; )
3802     {
3803       /* Create a new tree and link it back to the current parent.  */
3804       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3805       if (*p_new == NULL)
3806         return NULL;
3807       (*p_new)->parent = dup_node;
3808       (*p_new)->token.duplicated = 1;
3809       dup_node = *p_new;
3810
3811       /* Go to the left node, or up and to the right.  */
3812       if (node->left)
3813         {
3814           node = node->left;
3815           p_new = &dup_node->left;
3816         }
3817       else
3818         {
3819           const bin_tree_t *prev = NULL;
3820           while (node->right == prev || node->right == NULL)
3821             {
3822               prev = node;
3823               node = node->parent;
3824               dup_node = dup_node->parent;
3825               if (!node)
3826                 return dup_root;
3827             }
3828           node = node->right;
3829           p_new = &dup_node->right;
3830         }
3831     }
3832 }