git.cworth.org Git - tar/blob - gnu/regcomp.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   5    Software Foundation, Inc.
   6    This file is part of the GNU C Library.
   7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3, or (at your option)
  12    any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License along
  20    with this program; if not, write to the Free Software Foundation,
  21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  22
  23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  24                                           size_t length, reg_syntax_t syntax);
  25 static void re_compile_fastmap_iter (regex_t *bufp,
  26                                      const re_dfastate_t *init_state,
  27                                      char *fastmap);
  28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  29 #ifdef RE_ENABLE_I18N
  30 static void free_charset (re_charset_t *cset);
  31 #endif /* RE_ENABLE_I18N */
  32 static void free_workarea_compile (regex_t *preg);
  33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  34 #ifdef RE_ENABLE_I18N
  35 static void optimize_utf8 (re_dfa_t *dfa);
  36 #endif
  37 static reg_errcode_t analyze (regex_t *preg);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  53                                    unsigned int constraint);
  54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  56                                          Idx node, bool root);
  57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  58 static Idx fetch_number (re_string_t *input, re_token_t *token,
  59                          reg_syntax_t syntax);
  60 static int peek_token (re_token_t *token, re_string_t *input,
  61                         reg_syntax_t syntax) internal_function;
  62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  63                           reg_syntax_t syntax, reg_errcode_t *err);
  64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  65                                   re_token_t *token, reg_syntax_t syntax,
  66                                   Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  68                                  re_token_t *token, reg_syntax_t syntax,
  69                                  Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  71                                      re_token_t *token, reg_syntax_t syntax,
  72                                      Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  74                                   re_token_t *token, reg_syntax_t syntax,
  75                                   Idx nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  77                                  re_dfa_t *dfa, re_token_t *token,
  78                                  reg_syntax_t syntax, reg_errcode_t *err);
  79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  80                                       re_token_t *token, reg_syntax_t syntax,
  81                                       reg_errcode_t *err);
  82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  83                                             re_string_t *regexp,
  84                                             re_token_t *token, int token_len,
  85                                             re_dfa_t *dfa,
  86                                             reg_syntax_t syntax,
  87                                             bool accept_hyphen);
  88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  89                                           re_string_t *regexp,
  90                                           re_token_t *token);
  91 #ifdef RE_ENABLE_I18N
  92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  93                                         re_charset_t *mbcset,
  94                                         Idx *equiv_class_alloc,
  95                                         const unsigned char *name);
  96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  97                                       bitset_t sbcset,
  98                                       re_charset_t *mbcset,
  99                                       Idx *char_class_alloc,
 100                                       const unsigned char *class_name,
 101                                       reg_syntax_t syntax);
 102 #else  /* not RE_ENABLE_I18N */
 103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 104                                         const unsigned char *name);
 105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 106                                       bitset_t sbcset,
 107                                       const unsigned char *class_name,
 108                                       reg_syntax_t syntax);
 109 #endif /* not RE_ENABLE_I18N */
 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 111                                        RE_TRANSLATE_TYPE trans,
 112                                        const unsigned char *class_name,
 113                                        const unsigned char *extra,
 114                                        bool non_match, reg_errcode_t *err);
 115 static bin_tree_t *create_tree (re_dfa_t *dfa,
 116                                 bin_tree_t *left, bin_tree_t *right,
 117                                 re_token_type_t type);
 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 119                                       bin_tree_t *left, bin_tree_t *right,
 120                                       const re_token_t *token);
 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 122 static void free_token (re_token_t *node);
 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 125 \f
 126 /* This table gives an error message for each of the error codes listed
 127    in regex.h.  Obviously the order here has to be same as there.
 128    POSIX doesn't require that we do anything for REG_NOERROR,
 129    but why not be nice?  */
 130
 131 static const char __re_error_msgid[] =
 132   {
 133 #define REG_NOERROR_IDX 0
 134     gettext_noop ("Success")    /* REG_NOERROR */
 135     "\0"
 136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 137     gettext_noop ("No match")   /* REG_NOMATCH */
 138     "\0"
 139 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 141     "\0"
 142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 144     "\0"
 145 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 147     "\0"
 148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 150     "\0"
 151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 153     "\0"
 154 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 155     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 156     "\0"
 157 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 159     "\0"
 160 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 162     "\0"
 163 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 165     "\0"
 166 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 167     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 168     "\0"
 169 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 171     "\0"
 172 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 174     "\0"
 175 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 177     "\0"
 178 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 180     "\0"
 181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 183   };
 184
 185 static const size_t __re_error_msgid_idx[] =
 186   {
 187     REG_NOERROR_IDX,
 188     REG_NOMATCH_IDX,
 189     REG_BADPAT_IDX,
 190     REG_ECOLLATE_IDX,
 191     REG_ECTYPE_IDX,
 192     REG_EESCAPE_IDX,
 193     REG_ESUBREG_IDX,
 194     REG_EBRACK_IDX,
 195     REG_EPAREN_IDX,
 196     REG_EBRACE_IDX,
 197     REG_BADBR_IDX,
 198     REG_ERANGE_IDX,
 199     REG_ESPACE_IDX,
 200     REG_BADRPT_IDX,
 201     REG_EEND_IDX,
 202     REG_ESIZE_IDX,
 203     REG_ERPAREN_IDX
 204   };
 205 \f
 206 /* Entry points for GNU code.  */
 207
 208 /* re_compile_pattern is the GNU regular expression compiler: it
 209    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 210    Returns 0 if the pattern was valid, otherwise an error string.
 211
 212    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 213    are set in BUFP on entry.  */
 214
 215 #ifdef _LIBC
 216 const char *
 217 re_compile_pattern (pattern, length, bufp)
 218     const char *pattern;
 219     size_t length;
 220     struct re_pattern_buffer *bufp;
 221 #else /* size_t might promote */
 222 const char *
 223 re_compile_pattern (const char *pattern, size_t length,
 224                     struct re_pattern_buffer *bufp)
 225 #endif
 226 {
 227   reg_errcode_t ret;
 228
 229   /* And GNU code determines whether or not to get register information
 230      by passing null for the REGS argument to re_match, etc., not by
 231      setting no_sub, unless RE_NO_SUB is set.  */
 232   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 233
 234   /* Match anchors at newline.  */
 235   bufp->newline_anchor = 1;
 236
 237   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 238
 239   if (!ret)
 240     return NULL;
 241   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 242 }
 243 #ifdef _LIBC
 244 weak_alias (__re_compile_pattern, re_compile_pattern)
 245 #endif
 246
 247 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 248    also be assigned to arbitrarily: each pattern buffer stores its own
 249    syntax, so it can be changed between regex compilations.  */
 250 /* This has no initializer because initialized variables in Emacs
 251    become read-only after dumping.  */
 252 reg_syntax_t re_syntax_options;
 253
 254
 255 /* Specify the precise syntax of regexps for compilation.  This provides
 256    for compatibility for various utilities which historically have
 257    different, incompatible syntaxes.
 258
 259    The argument SYNTAX is a bit mask comprised of the various bits
 260    defined in regex.h.  We return the old syntax.  */
 261
 262 reg_syntax_t
 263 re_set_syntax (syntax)
 264     reg_syntax_t syntax;
 265 {
 266   reg_syntax_t ret = re_syntax_options;
 267
 268   re_syntax_options = syntax;
 269   return ret;
 270 }
 271 #ifdef _LIBC
 272 weak_alias (__re_set_syntax, re_set_syntax)
 273 #endif
 274
 275 int
 276 re_compile_fastmap (bufp)
 277     struct re_pattern_buffer *bufp;
 278 {
 279   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 280   char *fastmap = bufp->fastmap;
 281
 282   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 283   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 284   if (dfa->init_state != dfa->init_state_word)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 286   if (dfa->init_state != dfa->init_state_nl)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 288   if (dfa->init_state != dfa->init_state_begbuf)
 289     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 290   bufp->fastmap_accurate = 1;
 291   return 0;
 292 }
 293 #ifdef _LIBC
 294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 295 #endif
 296
 297 static inline void
 298 __attribute ((always_inline))
 299 re_set_fastmap (char *fastmap, bool icase, int ch)
 300 {
 301   fastmap[ch] = 1;
 302   if (icase)
 303     fastmap[tolower (ch)] = 1;
 304 }
 305
 306 /* Helper function for re_compile_fastmap.
 307    Compile fastmap for the initial_state INIT_STATE.  */
 308
 309 static void
 310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 311                          char *fastmap)
 312 {
 313   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 314   Idx node_cnt;
 315   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 316   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 317     {
 318       Idx node = init_state->nodes.elems[node_cnt];
 319       re_token_type_t type = dfa->nodes[node].type;
 320
 321       if (type == CHARACTER)
 322         {
 323           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 324 #ifdef RE_ENABLE_I18N
 325           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 326             {
 327               unsigned char buf[MB_LEN_MAX];
 328               unsigned char *p;
 329               wchar_t wc;
 330               mbstate_t state;
 331
 332               p = buf;
 333               *p++ = dfa->nodes[node].opr.c;
 334               while (++node < dfa->nodes_len
 335                      && dfa->nodes[node].type == CHARACTER
 336                      && dfa->nodes[node].mb_partial)
 337                 *p++ = dfa->nodes[node].opr.c;
 338               memset (&state, '\0', sizeof (state));
 339               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 340                              &state) == p - buf
 341                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 342                       != (size_t) -1))
 343                 re_set_fastmap (fastmap, false, buf[0]);
 344             }
 345 #endif
 346         }
 347       else if (type == SIMPLE_BRACKET)
 348         {
 349           int i, ch;
 350           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 351             {
 352               int j;
 353               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 354               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 355                 if (w & ((bitset_word_t) 1 << j))
 356                   re_set_fastmap (fastmap, icase, ch);
 357             }
 358         }
 359 #ifdef RE_ENABLE_I18N
 360       else if (type == COMPLEX_BRACKET)
 361         {
 362           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 363           Idx i;
 364
 365 # ifdef _LIBC
 366           /* See if we have to try all bytes which start multiple collation
 367              elements.
 368              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 369                   collation element, and don't catch 'b' since 'b' is
 370                   the only collation element which starts from 'b' (and
 371                   it is caught by SIMPLE_BRACKET).  */
 372               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 373                   && (cset->ncoll_syms || cset->nranges))
 374                 {
 375                   const int32_t *table = (const int32_t *)
 376                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 377                   for (i = 0; i < SBC_MAX; ++i)
 378                     if (table[i] < 0)
 379                       re_set_fastmap (fastmap, icase, i);
 380                 }
 381 # endif /* _LIBC */
 382
 383           /* See if we have to start the match at all multibyte characters,
 384              i.e. where we would not find an invalid sequence.  This only
 385              applies to multibyte character sets; for single byte character
 386              sets, the SIMPLE_BRACKET again suffices.  */
 387           if (dfa->mb_cur_max > 1
 388               && (cset->nchar_classes || cset->non_match || cset->nranges
 389 # ifdef _LIBC
 390                   || cset->nequiv_classes
 391 # endif /* _LIBC */
 392                  ))
 393             {
 394               unsigned char c = 0;
 395               do
 396                 {
 397                   mbstate_t mbs;
 398                   memset (&mbs, 0, sizeof (mbs));
 399                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 400                     re_set_fastmap (fastmap, false, (int) c);
 401                 }
 402               while (++c != 0);
 403             }
 404
 405           else
 406             {
 407               /* ... Else catch all bytes which can start the mbchars.  */
 408               for (i = 0; i < cset->nmbchars; ++i)
 409                 {
 410                   char buf[256];
 411                   mbstate_t state;
 412                   memset (&state, '\0', sizeof (state));
 413                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 414                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 415                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 416                     {
 417                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 418                           != (size_t) -1)
 419                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 420                     }
 421                 }
 422             }
 423         }
 424 #endif /* RE_ENABLE_I18N */
 425       else if (type == OP_PERIOD
 426 #ifdef RE_ENABLE_I18N
 427                || type == OP_UTF8_PERIOD
 428 #endif /* RE_ENABLE_I18N */
 429                || type == END_OF_RE)
 430         {
 431           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 432           if (type == END_OF_RE)
 433             bufp->can_be_null = 1;
 434           return;
 435         }
 436     }
 437 }
 438 \f
 439 /* Entry point for POSIX code.  */
 440 /* regcomp takes a regular expression as a string and compiles it.
 441
 442    PREG is a regex_t *.  We do not expect any fields to be initialized,
 443    since POSIX says we shouldn't.  Thus, we set
 444
 445      `buffer' to the compiled pattern;
 446      `used' to the length of the compiled pattern;
 447      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 448        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 449        RE_SYNTAX_POSIX_BASIC;
 450      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 451      `fastmap' to an allocated space for the fastmap;
 452      `fastmap_accurate' to zero;
 453      `re_nsub' to the number of subexpressions in PATTERN.
 454
 455    PATTERN is the address of the pattern string.
 456
 457    CFLAGS is a series of bits which affect compilation.
 458
 459      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 460      use POSIX basic syntax.
 461
 462      If REG_NEWLINE is set, then . and [^...] don't match newline.
 463      Also, regexec will try a match beginning after every newline.
 464
 465      If REG_ICASE is set, then we considers upper- and lowercase
 466      versions of letters to be equivalent when matching.
 467
 468      If REG_NOSUB is set, then when PREG is passed to regexec, that
 469      routine will report only success or failure, and nothing about the
 470      registers.
 471
 472    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 473    the return codes and their meanings.)  */
 474
 475 int
 476 regcomp (preg, pattern, cflags)
 477     regex_t *_Restrict_ preg;
 478     const char *_Restrict_ pattern;
 479     int cflags;
 480 {
 481   reg_errcode_t ret;
 482   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 483                          : RE_SYNTAX_POSIX_BASIC);
 484
 485   preg->buffer = NULL;
 486   preg->allocated = 0;
 487   preg->used = 0;
 488
 489   /* Try to allocate space for the fastmap.  */
 490   preg->fastmap = re_malloc (char, SBC_MAX);
 491   if (BE (preg->fastmap == NULL, 0))
 492     return REG_ESPACE;
 493
 494   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 495
 496   /* If REG_NEWLINE is set, newlines are treated differently.  */
 497   if (cflags & REG_NEWLINE)
 498     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 499       syntax &= ~RE_DOT_NEWLINE;
 500       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 501       /* It also changes the matching behavior.  */
 502       preg->newline_anchor = 1;
 503     }
 504   else
 505     preg->newline_anchor = 0;
 506   preg->no_sub = !!(cflags & REG_NOSUB);
 507   preg->translate = NULL;
 508
 509   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 510
 511   /* POSIX doesn't distinguish between an unmatched open-group and an
 512      unmatched close-group: both are REG_EPAREN.  */
 513   if (ret == REG_ERPAREN)
 514     ret = REG_EPAREN;
 515
 516   /* We have already checked preg->fastmap != NULL.  */
 517   if (BE (ret == REG_NOERROR, 1))
 518     /* Compute the fastmap now, since regexec cannot modify the pattern
 519        buffer.  This function never fails in this implementation.  */
 520     (void) re_compile_fastmap (preg);
 521   else
 522     {
 523       /* Some error occurred while compiling the expression.  */
 524       re_free (preg->fastmap);
 525       preg->fastmap = NULL;
 526     }
 527
 528   return (int) ret;
 529 }
 530 #ifdef _LIBC
 531 weak_alias (__regcomp, regcomp)
 532 #endif
 533
 534 /* Returns a message corresponding to an error code, ERRCODE, returned
 535    from either regcomp or regexec.   We don't use PREG here.  */
 536
 537 #ifdef _LIBC
 538 size_t
 539 regerror (errcode, preg, errbuf, errbuf_size)
 540     int errcode;
 541     const regex_t *_Restrict_ preg;
 542     char *_Restrict_ errbuf;
 543     size_t errbuf_size;
 544 #else /* size_t might promote */
 545 size_t
 546 regerror (int errcode, const regex_t *_Restrict_ preg,
 547           char *_Restrict_ errbuf, size_t errbuf_size)
 548 #endif
 549 {
 550   const char *msg;
 551   size_t msg_size;
 552
 553   if (BE (errcode < 0
 554           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 555                                / sizeof (__re_error_msgid_idx[0])), 0))
 556     /* Only error codes returned by the rest of the code should be passed
 557        to this routine.  If we are given anything else, or if other regex
 558        code generates an invalid error code, then the program has a bug.
 559        Dump core so we can fix it.  */
 560     abort ();
 561
 562   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 563
 564   msg_size = strlen (msg) + 1; /* Includes the null.  */
 565
 566   if (BE (errbuf_size != 0, 1))
 567     {
 568       size_t cpy_size = msg_size;
 569       if (BE (msg_size > errbuf_size, 0))
 570         {
 571           cpy_size = errbuf_size - 1;
 572           errbuf[cpy_size] = '\0';
 573         }
 574       memcpy (errbuf, msg, cpy_size);
 575     }
 576
 577   return msg_size;
 578 }
 579 #ifdef _LIBC
 580 weak_alias (__regerror, regerror)
 581 #endif
 582
 583
 584 #ifdef RE_ENABLE_I18N
 585 /* This static array is used for the map to single-byte characters when
 586    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 587    it the same all the time.  UTF-8 is the preferred encoding so this is
 588    a worthwhile optimization.  */
 589 static const bitset_t utf8_sb_map =
 590 {
 591   /* Set the first 128 bits.  */
 592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 593 #  error "bitset_word_t is narrower than 32 bits"
 594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX,
 600 # endif
 601   (BITSET_WORD_MAX
 602    >> (SBC_MAX % BITSET_WORD_BITS == 0
 603        ? 0
 604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 605 };
 606 #endif
 607
 608
 609 static void
 610 free_dfa_content (re_dfa_t *dfa)
 611 {
 612   Idx i, j;
 613
 614   if (dfa->nodes)
 615     for (i = 0; i < dfa->nodes_len; ++i)
 616       free_token (dfa->nodes + i);
 617   re_free (dfa->nexts);
 618   for (i = 0; i < dfa->nodes_len; ++i)
 619     {
 620       if (dfa->eclosures != NULL)
 621         re_node_set_free (dfa->eclosures + i);
 622       if (dfa->inveclosures != NULL)
 623         re_node_set_free (dfa->inveclosures + i);
 624       if (dfa->edests != NULL)
 625         re_node_set_free (dfa->edests + i);
 626     }
 627   re_free (dfa->edests);
 628   re_free (dfa->eclosures);
 629   re_free (dfa->inveclosures);
 630   re_free (dfa->nodes);
 631
 632   if (dfa->state_table)
 633     for (i = 0; i <= dfa->state_hash_mask; ++i)
 634       {
 635         struct re_state_table_entry *entry = dfa->state_table + i;
 636         for (j = 0; j < entry->num; ++j)
 637           {
 638             re_dfastate_t *state = entry->array[j];
 639             free_state (state);
 640           }
 641         re_free (entry->array);
 642       }
 643   re_free (dfa->state_table);
 644 #ifdef RE_ENABLE_I18N
 645   if (dfa->sb_char != utf8_sb_map)
 646     re_free (dfa->sb_char);
 647 #endif
 648   re_free (dfa->subexp_map);
 649 #ifdef DEBUG
 650   re_free (dfa->re_str);
 651 #endif
 652
 653   re_free (dfa);
 654 }
 655
 656
 657 /* Free dynamically allocated space used by PREG.  */
 658
 659 void
 660 regfree (preg)
 661     regex_t *preg;
 662 {
 663   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 664   if (BE (dfa != NULL, 1))
 665     free_dfa_content (dfa);
 666   preg->buffer = NULL;
 667   preg->allocated = 0;
 668
 669   re_free (preg->fastmap);
 670   preg->fastmap = NULL;
 671
 672   re_free (preg->translate);
 673   preg->translate = NULL;
 674 }
 675 #ifdef _LIBC
 676 weak_alias (__regfree, regfree)
 677 #endif
 678 \f
 679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 680    them unless specifically requested.  */
 681
 682 #if defined _REGEX_RE_COMP || defined _LIBC
 683
 684 /* BSD has one and only one pattern buffer.  */
 685 static struct re_pattern_buffer re_comp_buf;
 686
 687 char *
 688 # ifdef _LIBC
 689 /* Make these definitions weak in libc, so POSIX programs can redefine
 690    these names if they don't use our functions, and still use
 691    regcomp/regexec above without link errors.  */
 692 weak_function
 693 # endif
 694 re_comp (s)
 695      const char *s;
 696 {
 697   reg_errcode_t ret;
 698   char *fastmap;
 699
 700   if (!s)
 701     {
 702       if (!re_comp_buf.buffer)
 703         return gettext ("No previous regular expression");
 704       return 0;
 705     }
 706
 707   if (re_comp_buf.buffer)
 708     {
 709       fastmap = re_comp_buf.fastmap;
 710       re_comp_buf.fastmap = NULL;
 711       __regfree (&re_comp_buf);
 712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 713       re_comp_buf.fastmap = fastmap;
 714     }
 715
 716   if (re_comp_buf.fastmap == NULL)
 717     {
 718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 719       if (re_comp_buf.fastmap == NULL)
 720         return (char *) gettext (__re_error_msgid
 721                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 722     }
 723
 724   /* Since `re_exec' always passes NULL for the `regs' argument, we
 725      don't need to initialize the pattern buffer fields which affect it.  */
 726
 727   /* Match anchors at newlines.  */
 728   re_comp_buf.newline_anchor = 1;
 729
 730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 731
 732   if (!ret)
 733     return NULL;
 734
 735   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 737 }
 738
 739 #ifdef _LIBC
 740 libc_freeres_fn (free_mem)
 741 {
 742   __regfree (&re_comp_buf);
 743 }
 744 #endif
 745
 746 #endif /* _REGEX_RE_COMP */
 747 \f
 748 /* Internal entry point.
 749    Compile the regular expression PATTERN, whose length is LENGTH.
 750    SYNTAX indicate regular expression's syntax.  */
 751
 752 static reg_errcode_t
 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 754                      reg_syntax_t syntax)
 755 {
 756   reg_errcode_t err = REG_NOERROR;
 757   re_dfa_t *dfa;
 758   re_string_t regexp;
 759
 760   /* Initialize the pattern buffer.  */
 761   preg->fastmap_accurate = 0;
 762   preg->syntax = syntax;
 763   preg->not_bol = preg->not_eol = 0;
 764   preg->used = 0;
 765   preg->re_nsub = 0;
 766   preg->can_be_null = 0;
 767   preg->regs_allocated = REGS_UNALLOCATED;
 768
 769   /* Initialize the dfa.  */
 770   dfa = (re_dfa_t *) preg->buffer;
 771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 772     {
 773       /* If zero allocated, but buffer is non-null, try to realloc
 774          enough space.  This loses if buffer's address is bogus, but
 775          that is the user's responsibility.  If ->buffer is NULL this
 776          is a simple allocation.  */
 777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 778       if (dfa == NULL)
 779         return REG_ESPACE;
 780       preg->allocated = sizeof (re_dfa_t);
 781       preg->buffer = (unsigned char *) dfa;
 782     }
 783   preg->used = sizeof (re_dfa_t);
 784
 785   err = init_dfa (dfa, length);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788       free_dfa_content (dfa);
 789       preg->buffer = NULL;
 790       preg->allocated = 0;
 791       return err;
 792     }
 793 #ifdef DEBUG
 794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 795   dfa->re_str = re_malloc (char, length + 1);
 796   strncpy (dfa->re_str, pattern, length + 1);
 797 #endif
 798
 799   __libc_lock_init (dfa->lock);
 800
 801   err = re_string_construct (&regexp, pattern, length, preg->translate,
 802                              (syntax & RE_ICASE) != 0, dfa);
 803   if (BE (err != REG_NOERROR, 0))
 804     {
 805     re_compile_internal_free_return:
 806       free_workarea_compile (preg);
 807       re_string_destruct (&regexp);
 808       free_dfa_content (dfa);
 809       preg->buffer = NULL;
 810       preg->allocated = 0;
 811       return err;
 812     }
 813
 814   /* Parse the regular expression, and build a structure tree.  */
 815   preg->re_nsub = 0;
 816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 817   if (BE (dfa->str_tree == NULL, 0))
 818     goto re_compile_internal_free_return;
 819
 820   /* Analyze the tree and create the nfa.  */
 821   err = analyze (preg);
 822   if (BE (err != REG_NOERROR, 0))
 823     goto re_compile_internal_free_return;
 824
 825 #ifdef RE_ENABLE_I18N
 826   /* If possible, do searching in single byte encoding to speed things up.  */
 827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 828     optimize_utf8 (dfa);
 829 #endif
 830
 831   /* Then create the initial state of the dfa.  */
 832   err = create_initial_state (dfa);
 833
 834   /* Release work areas.  */
 835   free_workarea_compile (preg);
 836   re_string_destruct (&regexp);
 837
 838   if (BE (err != REG_NOERROR, 0))
 839     {
 840       free_dfa_content (dfa);
 841       preg->buffer = NULL;
 842       preg->allocated = 0;
 843     }
 844
 845   return err;
 846 }
 847
 848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 849    as the initial length of some arrays.  */
 850
 851 static reg_errcode_t
 852 init_dfa (re_dfa_t *dfa, size_t pat_len)
 853 {
 854   __re_size_t table_size;
 855 #ifndef _LIBC
 856   char *codeset_name;
 857 #endif
 858 #ifdef RE_ENABLE_I18N
 859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 860 #else
 861   size_t max_i18n_object_size = 0;
 862 #endif
 863   size_t max_object_size =
 864     MAX (sizeof (struct re_state_table_entry),
 865          MAX (sizeof (re_token_t),
 866               MAX (sizeof (re_node_set),
 867                    MAX (sizeof (regmatch_t),
 868                         max_i18n_object_size))));
 869
 870   memset (dfa, '\0', sizeof (re_dfa_t));
 871
 872   /* Force allocation of str_tree_storage the first time.  */
 873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 874
 875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 876      calculation below, and for similar doubling calculations
 877      elsewhere.  And it's <= rather than <, because some of the
 878      doubling calculations add 1 afterwards.  */
 879   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 880     return REG_ESPACE;
 881
 882   dfa->nodes_alloc = pat_len + 1;
 883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 884
 885   /*  table_size = 2 ^ ceil(log pat_len) */
 886   for (table_size = 1; ; table_size <<= 1)
 887     if (table_size > pat_len)
 888       break;
 889
 890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 891   dfa->state_hash_mask = table_size - 1;
 892
 893   dfa->mb_cur_max = MB_CUR_MAX;
 894 #ifdef _LIBC
 895   if (dfa->mb_cur_max == 6
 896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 899                        != 0);
 900 #else
 901   codeset_name = nl_langinfo (CODESET);
 902   if (strcasecmp (codeset_name, "UTF-8") == 0
 903       || strcasecmp (codeset_name, "UTF8") == 0)
 904     dfa->is_utf8 = 1;
 905
 906   /* We check exhaustively in the loop below if this charset is a
 907      superset of ASCII.  */
 908   dfa->map_notascii = 0;
 909 #endif
 910
 911 #ifdef RE_ENABLE_I18N
 912   if (dfa->mb_cur_max > 1)
 913     {
 914       if (dfa->is_utf8)
 915         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 916       else
 917         {
 918           int i, j, ch;
 919
 920           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 921           if (BE (dfa->sb_char == NULL, 0))
 922             return REG_ESPACE;
 923
 924           /* Set the bits corresponding to single byte chars.  */
 925           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 926             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 927               {
 928                 wint_t wch = __btowc (ch);
 929                 if (wch != WEOF)
 930                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 931 # ifndef _LIBC
 932                 if (isascii (ch) && wch != ch)
 933                   dfa->map_notascii = 1;
 934 # endif
 935               }
 936         }
 937     }
 938 #endif
 939
 940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 941     return REG_ESPACE;
 942   return REG_NOERROR;
 943 }
 944
 945 /* Initialize WORD_CHAR table, which indicate which character is
 946    "word".  In this case "word" means that it is the word construction
 947    character used by some operators like "\<", "\>", etc.  */
 948
 949 static void
 950 internal_function
 951 init_word_char (re_dfa_t *dfa)
 952 {
 953   int i, j, ch;
 954   dfa->word_ops_used = 1;
 955   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 956     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 957       if (isalnum (ch) || ch == '_')
 958         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 959 }
 960
 961 /* Free the work area which are only used while compiling.  */
 962
 963 static void
 964 free_workarea_compile (regex_t *preg)
 965 {
 966   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 967   bin_tree_storage_t *storage, *next;
 968   for (storage = dfa->str_tree_storage; storage; storage = next)
 969     {
 970       next = storage->next;
 971       re_free (storage);
 972     }
 973   dfa->str_tree_storage = NULL;
 974   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 975   dfa->str_tree = NULL;
 976   re_free (dfa->org_indices);
 977   dfa->org_indices = NULL;
 978 }
 979
 980 /* Create initial states for all contexts.  */
 981
 982 static reg_errcode_t
 983 create_initial_state (re_dfa_t *dfa)
 984 {
 985   Idx first, i;
 986   reg_errcode_t err;
 987   re_node_set init_nodes;
 988
 989   /* Initial states have the epsilon closure of the node which is
 990      the first node of the regular expression.  */
 991   first = dfa->str_tree->first->node_idx;
 992   dfa->init_node = first;
 993   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 994   if (BE (err != REG_NOERROR, 0))
 995     return err;
 996
 997   /* The back-references which are in initial states can epsilon transit,
 998      since in this case all of the subexpressions can be null.
 999      Then we add epsilon closures of the nodes which are the next nodes of
1000      the back-references.  */
1001   if (dfa->nbackref > 0)
1002     for (i = 0; i < init_nodes.nelem; ++i)
1003       {
1004         Idx node_idx = init_nodes.elems[i];
1005         re_token_type_t type = dfa->nodes[node_idx].type;
1006
1007         Idx clexp_idx;
1008         if (type != OP_BACK_REF)
1009           continue;
1010         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011           {
1012             re_token_t *clexp_node;
1013             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014             if (clexp_node->type == OP_CLOSE_SUBEXP
1015                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016               break;
1017           }
1018         if (clexp_idx == init_nodes.nelem)
1019           continue;
1020
1021         if (type == OP_BACK_REF)
1022           {
1023             Idx dest_idx = dfa->edests[node_idx].elems[0];
1024             if (!re_node_set_contains (&init_nodes, dest_idx))
1025               {
1026                 reg_errcode_t merge_err
1027                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1028                 if (merge_err != REG_NOERROR)
1029                   return merge_err;
1030                 i = 0;
1031               }
1032           }
1033       }
1034
1035   /* It must be the first time to invoke acquire_state.  */
1036   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1037   /* We don't check ERR here, since the initial state must not be NULL.  */
1038   if (BE (dfa->init_state == NULL, 0))
1039     return err;
1040   if (dfa->init_state->has_constraint)
1041     {
1042       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1043                                                        CONTEXT_WORD);
1044       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1045                                                      CONTEXT_NEWLINE);
1046       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1047                                                          &init_nodes,
1048                                                          CONTEXT_NEWLINE
1049                                                          | CONTEXT_BEGBUF);
1050       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1051               || dfa->init_state_begbuf == NULL, 0))
1052         return err;
1053     }
1054   else
1055     dfa->init_state_word = dfa->init_state_nl
1056       = dfa->init_state_begbuf = dfa->init_state;
1057
1058   re_node_set_free (&init_nodes);
1059   return REG_NOERROR;
1060 }
1061 \f
1062 #ifdef RE_ENABLE_I18N
1063 /* If it is possible to do searching in single byte encoding instead of UTF-8
1064    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1065    DFA nodes where needed.  */
1066
1067 static void
1068 optimize_utf8 (re_dfa_t *dfa)
1069 {
1070   Idx node;
1071   int i;
1072   bool mb_chars = false;
1073   bool has_period = false;
1074
1075   for (node = 0; node < dfa->nodes_len; ++node)
1076     switch (dfa->nodes[node].type)
1077       {
1078       case CHARACTER:
1079         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1080           mb_chars = true;
1081         break;
1082       case ANCHOR:
1083         switch (dfa->nodes[node].opr.ctx_type)
1084           {
1085           case LINE_FIRST:
1086           case LINE_LAST:
1087           case BUF_FIRST:
1088           case BUF_LAST:
1089             break;
1090           default:
1091             /* Word anchors etc. cannot be handled.  It's okay to test
1092                opr.ctx_type since constraints (for all DFA nodes) are
1093                created by ORing one or more opr.ctx_type values.  */
1094             return;
1095           }
1096         break;
1097       case OP_PERIOD:
1098         has_period = true;
1099         break;
1100       case OP_BACK_REF:
1101       case OP_ALT:
1102       case END_OF_RE:
1103       case OP_DUP_ASTERISK:
1104       case OP_OPEN_SUBEXP:
1105       case OP_CLOSE_SUBEXP:
1106         break;
1107       case COMPLEX_BRACKET:
1108         return;
1109       case SIMPLE_BRACKET:
1110         /* Just double check.  */
1111         {
1112           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1113                         ? 0
1114                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1115           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1116             {
1117               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1118                 return;
1119               rshift = 0;
1120             }
1121         }
1122         break;
1123       default:
1124         abort ();
1125       }
1126
1127   if (mb_chars || has_period)
1128     for (node = 0; node < dfa->nodes_len; ++node)
1129       {
1130         if (dfa->nodes[node].type == CHARACTER
1131             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1132           dfa->nodes[node].mb_partial = 0;
1133         else if (dfa->nodes[node].type == OP_PERIOD)
1134           dfa->nodes[node].type = OP_UTF8_PERIOD;
1135       }
1136
1137   /* The search can be in single byte locale.  */
1138   dfa->mb_cur_max = 1;
1139   dfa->is_utf8 = 0;
1140   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141 }
1142 #endif
1143 \f
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145    "eclosure", and "inveclosure".  */
1146
1147 static reg_errcode_t
1148 analyze (regex_t *preg)
1149 {
1150   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151   reg_errcode_t ret;
1152
1153   /* Allocate arrays.  */
1154   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1156   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159           || dfa->eclosures == NULL, 0))
1160     return REG_ESPACE;
1161
1162   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1163   if (dfa->subexp_map != NULL)
1164     {
1165       Idx i;
1166       for (i = 0; i < preg->re_nsub; i++)
1167         dfa->subexp_map[i] = i;
1168       preorder (dfa->str_tree, optimize_subexps, dfa);
1169       for (i = 0; i < preg->re_nsub; i++)
1170         if (dfa->subexp_map[i] != i)
1171           break;
1172       if (i == preg->re_nsub)
1173         {
1174           free (dfa->subexp_map);
1175           dfa->subexp_map = NULL;
1176         }
1177     }
1178
1179   ret = postorder (dfa->str_tree, lower_subexps, preg);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   ret = postorder (dfa->str_tree, calc_first, dfa);
1183   if (BE (ret != REG_NOERROR, 0))
1184     return ret;
1185   preorder (dfa->str_tree, calc_next, dfa);
1186   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189   ret = calc_eclosure (dfa);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192
1193   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1195   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196       || dfa->nbackref)
1197     {
1198       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199       if (BE (dfa->inveclosures == NULL, 0))
1200         return REG_ESPACE;
1201       ret = calc_inveclosure (dfa);
1202     }
1203
1204   return ret;
1205 }
1206
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208    implement parse tree visits.  Instead, we use parent pointers and
1209    some hairy code in these two functions.  */
1210 static reg_errcode_t
1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212            void *extra)
1213 {
1214   bin_tree_t *node, *prev;
1215
1216   for (node = root; ; )
1217     {
1218       /* Descend down the tree, preferably to the left (or to the right
1219          if that's the only child).  */
1220       while (node->left || node->right)
1221         if (node->left)
1222           node = node->left;
1223         else
1224           node = node->right;
1225
1226       do
1227         {
1228           reg_errcode_t err = fn (extra, node);
1229           if (BE (err != REG_NOERROR, 0))
1230             return err;
1231           if (node->parent == NULL)
1232             return REG_NOERROR;
1233           prev = node;
1234           node = node->parent;
1235         }
1236       /* Go up while we have a node that is reached from the right.  */
1237       while (node->right == prev || node->right == NULL);
1238       node = node->right;
1239     }
1240 }
1241
1242 static reg_errcode_t
1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244           void *extra)
1245 {
1246   bin_tree_t *node;
1247
1248   for (node = root; ; )
1249     {
1250       reg_errcode_t err = fn (extra, node);
1251       if (BE (err != REG_NOERROR, 0))
1252         return err;
1253
1254       /* Go to the left node, or up and to the right.  */
1255       if (node->left)
1256         node = node->left;
1257       else
1258         {
1259           bin_tree_t *prev = NULL;
1260           while (node->right == prev || node->right == NULL)
1261             {
1262               prev = node;
1263               node = node->parent;
1264               if (!node)
1265                 return REG_NOERROR;
1266             }
1267           node = node->right;
1268         }
1269     }
1270 }
1271
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1274    backreferences as well.  Requires a preorder visit.  */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra, bin_tree_t *node)
1277 {
1278   re_dfa_t *dfa = (re_dfa_t *) extra;
1279
1280   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281     {
1282       int idx = node->token.opr.idx;
1283       node->token.opr.idx = dfa->subexp_map[idx];
1284       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285     }
1286
1287   else if (node->token.type == SUBEXP
1288            && node->left && node->left->token.type == SUBEXP)
1289     {
1290       Idx other_idx = node->left->token.opr.idx;
1291
1292       node->left = node->left->left;
1293       if (node->left)
1294         node->left->parent = node;
1295
1296       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297       if (other_idx < BITSET_WORD_BITS)
1298         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299     }
1300
1301   return REG_NOERROR;
1302 }
1303
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1306 static reg_errcode_t
1307 lower_subexps (void *extra, bin_tree_t *node)
1308 {
1309   regex_t *preg = (regex_t *) extra;
1310   reg_errcode_t err = REG_NOERROR;
1311
1312   if (node->left && node->left->token.type == SUBEXP)
1313     {
1314       node->left = lower_subexp (&err, preg, node->left);
1315       if (node->left)
1316         node->left->parent = node;
1317     }
1318   if (node->right && node->right->token.type == SUBEXP)
1319     {
1320       node->right = lower_subexp (&err, preg, node->right);
1321       if (node->right)
1322         node->right->parent = node;
1323     }
1324
1325   return err;
1326 }
1327
1328 static bin_tree_t *
1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 {
1331   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332   bin_tree_t *body = node->left;
1333   bin_tree_t *op, *cls, *tree1, *tree;
1334
1335   if (preg->no_sub
1336       /* We do not optimize empty subexpressions, because otherwise we may
1337          have bad CONCAT nodes with NULL children.  This is obviously not
1338          very common, so we do not lose much.  An example that triggers
1339          this case is the sed "script" /\(\)/x.  */
1340       && node->left != NULL
1341       && (node->token.opr.idx >= BITSET_WORD_BITS
1342           || !(dfa->used_bkref_map
1343                & ((bitset_word_t) 1 << node->token.opr.idx))))
1344     return node->left;
1345
1346   /* Convert the SUBEXP node to the concatenation of an
1347      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1348   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351   tree = create_tree (dfa, op, tree1, CONCAT);
1352   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353     {
1354       *err = REG_ESPACE;
1355       return NULL;
1356     }
1357
1358   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360   return tree;
1361 }
1362
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364    nodes.  Requires a postorder visit.  */
1365 static reg_errcode_t
1366 calc_first (void *extra, bin_tree_t *node)
1367 {
1368   re_dfa_t *dfa = (re_dfa_t *) extra;
1369   if (node->token.type == CONCAT)
1370     {
1371       node->first = node->left->first;
1372       node->node_idx = node->left->node_idx;
1373     }
1374   else
1375     {
1376       node->first = node;
1377       node->node_idx = re_dfa_add_node (dfa, node->token);
1378       if (BE (node->node_idx == REG_MISSING, 0))
1379         return REG_ESPACE;
1380       if (node->token.type == ANCHOR)
1381         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1382     }
1383   return REG_NOERROR;
1384 }
1385
1386 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1387 static reg_errcode_t
1388 calc_next (void *extra, bin_tree_t *node)
1389 {
1390   switch (node->token.type)
1391     {
1392     case OP_DUP_ASTERISK:
1393       node->left->next = node;
1394       break;
1395     case CONCAT:
1396       node->left->next = node->right->first;
1397       node->right->next = node->next;
1398       break;
1399     default:
1400       if (node->left)
1401         node->left->next = node->next;
1402       if (node->right)
1403         node->right->next = node->next;
1404       break;
1405     }
1406   return REG_NOERROR;
1407 }
1408
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 {
1413   re_dfa_t *dfa = (re_dfa_t *) extra;
1414   Idx idx = node->node_idx;
1415   reg_errcode_t err = REG_NOERROR;
1416
1417   switch (node->token.type)
1418     {
1419     case CONCAT:
1420       break;
1421
1422     case END_OF_RE:
1423       assert (node->next == NULL);
1424       break;
1425
1426     case OP_DUP_ASTERISK:
1427     case OP_ALT:
1428       {
1429         Idx left, right;
1430         dfa->has_plural_match = 1;
1431         if (node->left != NULL)
1432           left = node->left->first->node_idx;
1433         else
1434           left = node->next->node_idx;
1435         if (node->right != NULL)
1436           right = node->right->first->node_idx;
1437         else
1438           right = node->next->node_idx;
1439         assert (REG_VALID_INDEX (left));
1440         assert (REG_VALID_INDEX (right));
1441         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1442       }
1443       break;
1444
1445     case ANCHOR:
1446     case OP_OPEN_SUBEXP:
1447     case OP_CLOSE_SUBEXP:
1448       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449       break;
1450
1451     case OP_BACK_REF:
1452       dfa->nexts[idx] = node->next->node_idx;
1453       if (node->token.type == OP_BACK_REF)
1454         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455       break;
1456
1457     default:
1458       assert (!IS_EPSILON_NODE (node->token.type));
1459       dfa->nexts[idx] = node->next->node_idx;
1460       break;
1461     }
1462
1463   return err;
1464 }
1465
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468    to their own constraint.  */
1469
1470 static reg_errcode_t
1471 internal_function
1472 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1473                         Idx root_node, unsigned int init_constraint)
1474 {
1475   Idx org_node, clone_node;
1476   bool ok;
1477   unsigned int constraint = init_constraint;
1478   for (org_node = top_org_node, clone_node = top_clone_node;;)
1479     {
1480       Idx org_dest, clone_dest;
1481       if (dfa->nodes[org_node].type == OP_BACK_REF)
1482         {
1483           /* If the back reference epsilon-transit, its destination must
1484              also have the constraint.  Then duplicate the epsilon closure
1485              of the destination of the back reference, and store it in
1486              edests of the back reference.  */
1487           org_dest = dfa->nexts[org_node];
1488           re_node_set_empty (dfa->edests + clone_node);
1489           clone_dest = duplicate_node (dfa, org_dest, constraint);
1490           if (BE (clone_dest == REG_MISSING, 0))
1491             return REG_ESPACE;
1492           dfa->nexts[clone_node] = dfa->nexts[org_node];
1493           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494           if (BE (! ok, 0))
1495             return REG_ESPACE;
1496         }
1497       else if (dfa->edests[org_node].nelem == 0)
1498         {
1499           /* In case of the node can't epsilon-transit, don't duplicate the
1500              destination and store the original destination as the
1501              destination of the node.  */
1502           dfa->nexts[clone_node] = dfa->nexts[org_node];
1503           break;
1504         }
1505       else if (dfa->edests[org_node].nelem == 1)
1506         {
1507           /* In case of the node can epsilon-transit, and it has only one
1508              destination.  */
1509           org_dest = dfa->edests[org_node].elems[0];
1510           re_node_set_empty (dfa->edests + clone_node);
1511           /* If the node is root_node itself, it means the epsilon closure
1512              has a loop.  Then tie it to the destination of the root_node.  */
1513           if (org_node == root_node && clone_node != org_node)
1514             {
1515               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1516               if (BE (! ok, 0))
1517                 return REG_ESPACE;
1518               break;
1519             }
1520           /* In case the node has another constraint, append it.  */
1521           constraint |= dfa->nodes[org_node].constraint;
1522           clone_dest = duplicate_node (dfa, org_dest, constraint);
1523           if (BE (clone_dest == REG_MISSING, 0))
1524             return REG_ESPACE;
1525           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526           if (BE (! ok, 0))
1527             return REG_ESPACE;
1528         }
1529       else /* dfa->edests[org_node].nelem == 2 */
1530         {
1531           /* In case of the node can epsilon-transit, and it has two
1532              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533           org_dest = dfa->edests[org_node].elems[0];
1534           re_node_set_empty (dfa->edests + clone_node);
1535           /* Search for a duplicated node which satisfies the constraint.  */
1536           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537           if (clone_dest == REG_MISSING)
1538             {
1539               /* There is no such duplicated node, create a new one.  */
1540               reg_errcode_t err;
1541               clone_dest = duplicate_node (dfa, org_dest, constraint);
1542               if (BE (clone_dest == REG_MISSING, 0))
1543                 return REG_ESPACE;
1544               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545               if (BE (! ok, 0))
1546                 return REG_ESPACE;
1547               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548                                             root_node, constraint);
1549               if (BE (err != REG_NOERROR, 0))
1550                 return err;
1551             }
1552           else
1553             {
1554               /* There is a duplicated node which satisfies the constraint,
1555                  use it to avoid infinite loop.  */
1556               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557               if (BE (! ok, 0))
1558                 return REG_ESPACE;
1559             }
1560
1561           org_dest = dfa->edests[org_node].elems[1];
1562           clone_dest = duplicate_node (dfa, org_dest, constraint);
1563           if (BE (clone_dest == REG_MISSING, 0))
1564             return REG_ESPACE;
1565           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566           if (BE (! ok, 0))
1567             return REG_ESPACE;
1568         }
1569       org_node = org_dest;
1570       clone_node = clone_dest;
1571     }
1572   return REG_NOERROR;
1573 }
1574
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576    satisfies the constraint CONSTRAINT.  */
1577
1578 static Idx
1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1580                         unsigned int constraint)
1581 {
1582   Idx idx;
1583   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584     {
1585       if (org_node == dfa->org_indices[idx]
1586           && constraint == dfa->nodes[idx].constraint)
1587         return idx; /* Found.  */
1588     }
1589   return REG_MISSING; /* Not found.  */
1590 }
1591
1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1593    Return the index of the new node, or REG_MISSING if insufficient storage is
1594    available.  */
1595
1596 static Idx
1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1598 {
1599   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1600   if (BE (dup_idx != REG_MISSING, 1))
1601     {
1602       dfa->nodes[dup_idx].constraint = constraint;
1603       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1604       dfa->nodes[dup_idx].duplicated = 1;
1605
1606       /* Store the index of the original node.  */
1607       dfa->org_indices[dup_idx] = org_idx;
1608     }
1609   return dup_idx;
1610 }
1611
1612 static reg_errcode_t
1613 calc_inveclosure (re_dfa_t *dfa)
1614 {
1615   Idx src, idx;
1616   bool ok;
1617   for (idx = 0; idx < dfa->nodes_len; ++idx)
1618     re_node_set_init_empty (dfa->inveclosures + idx);
1619
1620   for (src = 0; src < dfa->nodes_len; ++src)
1621     {
1622       Idx *elems = dfa->eclosures[src].elems;
1623       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624         {
1625           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1626           if (BE (! ok, 0))
1627             return REG_ESPACE;
1628         }
1629     }
1630
1631   return REG_NOERROR;
1632 }
1633
1634 /* Calculate "eclosure" for all the node in DFA.  */
1635
1636 static reg_errcode_t
1637 calc_eclosure (re_dfa_t *dfa)
1638 {
1639   Idx node_idx;
1640   bool incomplete;
1641 #ifdef DEBUG
1642   assert (dfa->nodes_len > 0);
1643 #endif
1644   incomplete = false;
1645   /* For each nodes, calculate epsilon closure.  */
1646   for (node_idx = 0; ; ++node_idx)
1647     {
1648       reg_errcode_t err;
1649       re_node_set eclosure_elem;
1650       if (node_idx == dfa->nodes_len)
1651         {
1652           if (!incomplete)
1653             break;
1654           incomplete = false;
1655           node_idx = 0;
1656         }
1657
1658 #ifdef DEBUG
1659       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1660 #endif
1661
1662       /* If we have already calculated, skip it.  */
1663       if (dfa->eclosures[node_idx].nelem != 0)
1664         continue;
1665       /* Calculate epsilon closure of `node_idx'.  */
1666       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1667       if (BE (err != REG_NOERROR, 0))
1668         return err;
1669
1670       if (dfa->eclosures[node_idx].nelem == 0)
1671         {
1672           incomplete = true;
1673           re_node_set_free (&eclosure_elem);
1674         }
1675     }
1676   return REG_NOERROR;
1677 }
1678
1679 /* Calculate epsilon closure of NODE.  */
1680
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1683 {
1684   reg_errcode_t err;
1685   Idx i;
1686   re_node_set eclosure;
1687   bool ok;
1688   bool incomplete = false;
1689   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690   if (BE (err != REG_NOERROR, 0))
1691     return err;
1692
1693   /* This indicates that we are calculating this node now.
1694      We reference this value to avoid infinite loop.  */
1695   dfa->eclosures[node].nelem = REG_MISSING;
1696
1697   /* If the current node has constraints, duplicate all nodes
1698      since they must inherit the constraints.  */
1699   if (dfa->nodes[node].constraint
1700       && dfa->edests[node].nelem
1701       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702     {
1703       err = duplicate_node_closure (dfa, node, node, node,
1704                                     dfa->nodes[node].constraint);
1705       if (BE (err != REG_NOERROR, 0))
1706         return err;
1707     }
1708
1709   /* Expand each epsilon destination nodes.  */
1710   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711     for (i = 0; i < dfa->edests[node].nelem; ++i)
1712       {
1713         re_node_set eclosure_elem;
1714         Idx edest = dfa->edests[node].elems[i];
1715         /* If calculating the epsilon closure of `edest' is in progress,
1716            return intermediate result.  */
1717         if (dfa->eclosures[edest].nelem == REG_MISSING)
1718           {
1719             incomplete = true;
1720             continue;
1721           }
1722         /* If we haven't calculated the epsilon closure of `edest' yet,
1723            calculate now. Otherwise use calculated epsilon closure.  */
1724         if (dfa->eclosures[edest].nelem == 0)
1725           {
1726             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1727             if (BE (err != REG_NOERROR, 0))
1728               return err;
1729           }
1730         else
1731           eclosure_elem = dfa->eclosures[edest];
1732         /* Merge the epsilon closure of `edest'.  */
1733         err = re_node_set_merge (&eclosure, &eclosure_elem);
1734         if (BE (err != REG_NOERROR, 0))
1735           return err;
1736         /* If the epsilon closure of `edest' is incomplete,
1737            the epsilon closure of this node is also incomplete.  */
1738         if (dfa->eclosures[edest].nelem == 0)
1739           {
1740             incomplete = true;
1741             re_node_set_free (&eclosure_elem);
1742           }
1743       }
1744
1745   /* An epsilon closure includes itself.  */
1746   ok = re_node_set_insert (&eclosure, node);
1747   if (BE (! ok, 0))
1748     return REG_ESPACE;
1749   if (incomplete && !root)
1750     dfa->eclosures[node].nelem = 0;
1751   else
1752     dfa->eclosures[node] = eclosure;
1753   *new_set = eclosure;
1754   return REG_NOERROR;
1755 }
1756 \f
1757 /* Functions for token which are used in the parser.  */
1758
1759 /* Fetch a token from INPUT.
1760    We must not use this function inside bracket expressions.  */
1761
1762 static void
1763 internal_function
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765 {
1766   re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 }
1768
1769 /* Peek a token from INPUT, and return the length of the token.
1770    We must not use this function inside bracket expressions.  */
1771
1772 static int
1773 internal_function
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775 {
1776   unsigned char c;
1777
1778   if (re_string_eoi (input))
1779     {
1780       token->type = END_OF_RE;
1781       return 0;
1782     }
1783
1784   c = re_string_peek_byte (input, 0);
1785   token->opr.c = c;
1786
1787   token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789   token->mb_partial = 0;
1790   if (input->mb_cur_max > 1 &&
1791       !re_string_first_byte (input, re_string_cur_idx (input)))
1792     {
1793       token->type = CHARACTER;
1794       token->mb_partial = 1;
1795       return 1;
1796     }
1797 #endif
1798   if (c == '\\')
1799     {
1800       unsigned char c2;
1801       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802         {
1803           token->type = BACK_SLASH;
1804           return 1;
1805         }
1806
1807       c2 = re_string_peek_byte_case (input, 1);
1808       token->opr.c = c2;
1809       token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811       if (input->mb_cur_max > 1)
1812         {
1813           wint_t wc = re_string_wchar_at (input,
1814                                           re_string_cur_idx (input) + 1);
1815           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816         }
1817       else
1818 #endif
1819         token->word_char = IS_WORD_CHAR (c2) != 0;
1820
1821       switch (c2)
1822         {
1823         case '|':
1824           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825             token->type = OP_ALT;
1826           break;
1827         case '1': case '2': case '3': case '4': case '5':
1828         case '6': case '7': case '8': case '9':
1829           if (!(syntax & RE_NO_BK_REFS))
1830             {
1831               token->type = OP_BACK_REF;
1832               token->opr.idx = c2 - '1';
1833             }
1834           break;
1835         case '<':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             {
1838               token->type = ANCHOR;
1839               token->opr.ctx_type = WORD_FIRST;
1840             }
1841           break;
1842         case '>':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = WORD_LAST;
1847             }
1848           break;
1849         case 'b':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = WORD_DELIM;
1854             }
1855           break;
1856         case 'B':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             {
1859               token->type = ANCHOR;
1860               token->opr.ctx_type = NOT_WORD_DELIM;
1861             }
1862           break;
1863         case 'w':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             token->type = OP_WORD;
1866           break;
1867         case 'W':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             token->type = OP_NOTWORD;
1870           break;
1871         case 's':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_SPACE;
1874           break;
1875         case 'S':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_NOTSPACE;
1878           break;
1879         case '`':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = BUF_FIRST;
1884             }
1885           break;
1886         case '\'':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = BUF_LAST;
1891             }
1892           break;
1893         case '(':
1894           if (!(syntax & RE_NO_BK_PARENS))
1895             token->type = OP_OPEN_SUBEXP;
1896           break;
1897         case ')':
1898           if (!(syntax & RE_NO_BK_PARENS))
1899             token->type = OP_CLOSE_SUBEXP;
1900           break;
1901         case '+':
1902           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903             token->type = OP_DUP_PLUS;
1904           break;
1905         case '?':
1906           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907             token->type = OP_DUP_QUESTION;
1908           break;
1909         case '{':
1910           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911             token->type = OP_OPEN_DUP_NUM;
1912           break;
1913         case '}':
1914           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915             token->type = OP_CLOSE_DUP_NUM;
1916           break;
1917         default:
1918           break;
1919         }
1920       return 2;
1921     }
1922
1923   token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925   if (input->mb_cur_max > 1)
1926     {
1927       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929     }
1930   else
1931 #endif
1932     token->word_char = IS_WORD_CHAR (token->opr.c);
1933
1934   switch (c)
1935     {
1936     case '\n':
1937       if (syntax & RE_NEWLINE_ALT)
1938         token->type = OP_ALT;
1939       break;
1940     case '|':
1941       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942         token->type = OP_ALT;
1943       break;
1944     case '*':
1945       token->type = OP_DUP_ASTERISK;
1946       break;
1947     case '+':
1948       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949         token->type = OP_DUP_PLUS;
1950       break;
1951     case '?':
1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953         token->type = OP_DUP_QUESTION;
1954       break;
1955     case '{':
1956       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957         token->type = OP_OPEN_DUP_NUM;
1958       break;
1959     case '}':
1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961         token->type = OP_CLOSE_DUP_NUM;
1962       break;
1963     case '(':
1964       if (syntax & RE_NO_BK_PARENS)
1965         token->type = OP_OPEN_SUBEXP;
1966       break;
1967     case ')':
1968       if (syntax & RE_NO_BK_PARENS)
1969         token->type = OP_CLOSE_SUBEXP;
1970       break;
1971     case '[':
1972       token->type = OP_OPEN_BRACKET;
1973       break;
1974     case '.':
1975       token->type = OP_PERIOD;
1976       break;
1977     case '^':
1978       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979           re_string_cur_idx (input) != 0)
1980         {
1981           char prev = re_string_peek_byte (input, -1);
1982           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983             break;
1984         }
1985       token->type = ANCHOR;
1986       token->opr.ctx_type = LINE_FIRST;
1987       break;
1988     case '$':
1989       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990           re_string_cur_idx (input) + 1 != re_string_length (input))
1991         {
1992           re_token_t next;
1993           re_string_skip_bytes (input, 1);
1994           peek_token (&next, input, syntax);
1995           re_string_skip_bytes (input, -1);
1996           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997             break;
1998         }
1999       token->type = ANCHOR;
2000       token->opr.ctx_type = LINE_LAST;
2001       break;
2002     default:
2003       break;
2004     }
2005   return 1;
2006 }
2007
2008 /* Peek a token from INPUT, and return the length of the token.
2009    We must not use this function out of bracket expressions.  */
2010
2011 static int
2012 internal_function
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014 {
2015   unsigned char c;
2016   if (re_string_eoi (input))
2017     {
2018       token->type = END_OF_RE;
2019       return 0;
2020     }
2021   c = re_string_peek_byte (input, 0);
2022   token->opr.c = c;
2023
2024 #ifdef RE_ENABLE_I18N
2025   if (input->mb_cur_max > 1 &&
2026       !re_string_first_byte (input, re_string_cur_idx (input)))
2027     {
2028       token->type = CHARACTER;
2029       return 1;
2030     }
2031 #endif /* RE_ENABLE_I18N */
2032
2033   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034       && re_string_cur_idx (input) + 1 < re_string_length (input))
2035     {
2036       /* In this case, '\' escape a character.  */
2037       unsigned char c2;
2038       re_string_skip_bytes (input, 1);
2039       c2 = re_string_peek_byte (input, 0);
2040       token->opr.c = c2;
2041       token->type = CHARACTER;
2042       return 1;
2043     }
2044   if (c == '[') /* '[' is a special char in a bracket exps.  */
2045     {
2046       unsigned char c2;
2047       int token_len;
2048       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049         c2 = re_string_peek_byte (input, 1);
2050       else
2051         c2 = 0;
2052       token->opr.c = c2;
2053       token_len = 2;
2054       switch (c2)
2055         {
2056         case '.':
2057           token->type = OP_OPEN_COLL_ELEM;
2058           break;
2059         case '=':
2060           token->type = OP_OPEN_EQUIV_CLASS;
2061           break;
2062         case ':':
2063           if (syntax & RE_CHAR_CLASSES)
2064             {
2065               token->type = OP_OPEN_CHAR_CLASS;
2066               break;
2067             }
2068           /* else fall through.  */
2069         default:
2070           token->type = CHARACTER;
2071           token->opr.c = c;
2072           token_len = 1;
2073           break;
2074         }
2075       return token_len;
2076     }
2077   switch (c)
2078     {
2079     case '-':
2080       token->type = OP_CHARSET_RANGE;
2081       break;
2082     case ']':
2083       token->type = OP_CLOSE_BRACKET;
2084       break;
2085     case '^':
2086       token->type = OP_NON_MATCH_LIST;
2087       break;
2088     default:
2089       token->type = CHARACTER;
2090     }
2091   return 1;
2092 }
2093 \f
2094 /* Functions for parser.  */
2095
2096 /* Entry point of the parser.
2097    Parse the regular expression REGEXP and return the structure tree.
2098    If an error is occured, ERR is set by error code, and return NULL.
2099    This function build the following tree, from regular expression <reg_exp>:
2100            CAT
2101            / \
2102           /   \
2103    <reg_exp>  EOR
2104
2105    CAT means concatenation.
2106    EOR means end of regular expression.  */
2107
2108 static bin_tree_t *
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110        reg_errcode_t *err)
2111 {
2112   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113   bin_tree_t *tree, *eor, *root;
2114   re_token_t current_token;
2115   dfa->syntax = syntax;
2116   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119     return NULL;
2120   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121   if (tree != NULL)
2122     root = create_tree (dfa, tree, eor, CONCAT);
2123   else
2124     root = eor;
2125   if (BE (eor == NULL || root == NULL, 0))
2126     {
2127       *err = REG_ESPACE;
2128       return NULL;
2129     }
2130   return root;
2131 }
2132
2133 /* This function build the following tree, from regular expression
2134    <branch1>|<branch2>:
2135            ALT
2136            / \
2137           /   \
2138    <branch1> <branch2>
2139
2140    ALT means alternative, which represents the operator `|'.  */
2141
2142 static bin_tree_t *
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2145 {
2146   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147   bin_tree_t *tree, *branch = NULL;
2148   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150     return NULL;
2151
2152   while (token->type == OP_ALT)
2153     {
2154       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155       if (token->type != OP_ALT && token->type != END_OF_RE
2156           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157         {
2158           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160             return NULL;
2161         }
2162       else
2163         branch = NULL;
2164       tree = create_tree (dfa, tree, branch, OP_ALT);
2165       if (BE (tree == NULL, 0))
2166         {
2167           *err = REG_ESPACE;
2168           return NULL;
2169         }
2170     }
2171   return tree;
2172 }
2173
2174 /* This function build the following tree, from regular expression
2175    <exp1><exp2>:
2176         CAT
2177         / \
2178        /   \
2179    <exp1> <exp2>
2180
2181    CAT means concatenation.  */
2182
2183 static bin_tree_t *
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2186 {
2187   bin_tree_t *tree, *expr;
2188   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191     return NULL;
2192
2193   while (token->type != OP_ALT && token->type != END_OF_RE
2194          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195     {
2196       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2197       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2198         {
2199           return NULL;
2200         }
2201       if (tree != NULL && expr != NULL)
2202         {
2203           tree = create_tree (dfa, tree, expr, CONCAT);
2204           if (tree == NULL)
2205             {
2206               *err = REG_ESPACE;
2207               return NULL;
2208             }
2209         }
2210       else if (tree == NULL)
2211         tree = expr;
2212       /* Otherwise expr == NULL, we don't need to create new tree.  */
2213     }
2214   return tree;
2215 }
2216
2217 /* This function build the following tree, from regular expression a*:
2218          *
2219          |
2220          a
2221 */
2222
2223 static bin_tree_t *
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2226 {
2227   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228   bin_tree_t *tree;
2229   switch (token->type)
2230     {
2231     case CHARACTER:
2232       tree = create_token_tree (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234         {
2235           *err = REG_ESPACE;
2236           return NULL;
2237         }
2238 #ifdef RE_ENABLE_I18N
2239       if (dfa->mb_cur_max > 1)
2240         {
2241           while (!re_string_eoi (regexp)
2242                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243             {
2244               bin_tree_t *mbc_remain;
2245               fetch_token (token, regexp, syntax);
2246               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248               if (BE (mbc_remain == NULL || tree == NULL, 0))
2249                 {
2250                   *err = REG_ESPACE;
2251                   return NULL;
2252                 }
2253             }
2254         }
2255 #endif
2256       break;
2257     case OP_OPEN_SUBEXP:
2258       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260         return NULL;
2261       break;
2262     case OP_OPEN_BRACKET:
2263       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265         return NULL;
2266       break;
2267     case OP_BACK_REF:
2268       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269         {
2270           *err = REG_ESUBREG;
2271           return NULL;
2272         }
2273       dfa->used_bkref_map |= 1 << token->opr.idx;
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276         {
2277           *err = REG_ESPACE;
2278           return NULL;
2279         }
2280       ++dfa->nbackref;
2281       dfa->has_mb_node = 1;
2282       break;
2283     case OP_OPEN_DUP_NUM:
2284       if (syntax & RE_CONTEXT_INVALID_DUP)
2285         {
2286           *err = REG_BADRPT;
2287           return NULL;
2288         }
2289       /* FALLTHROUGH */
2290     case OP_DUP_ASTERISK:
2291     case OP_DUP_PLUS:
2292     case OP_DUP_QUESTION:
2293       if (syntax & RE_CONTEXT_INVALID_OPS)
2294         {
2295           *err = REG_BADRPT;
2296           return NULL;
2297         }
2298       else if (syntax & RE_CONTEXT_INDEP_OPS)
2299         {
2300           fetch_token (token, regexp, syntax);
2301           return parse_expression (regexp, preg, token, syntax, nest, err);
2302         }
2303       /* else fall through  */
2304     case OP_CLOSE_SUBEXP:
2305       if ((token->type == OP_CLOSE_SUBEXP) &&
2306           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307         {
2308           *err = REG_ERPAREN;
2309           return NULL;
2310         }
2311       /* else fall through  */
2312     case OP_CLOSE_DUP_NUM:
2313       /* We treat it as a normal character.  */
2314
2315       /* Then we can these characters as normal characters.  */
2316       token->type = CHARACTER;
2317       /* mb_partial and word_char bits should be initialized already
2318          by peek_token.  */
2319       tree = create_token_tree (dfa, NULL, NULL, token);
2320       if (BE (tree == NULL, 0))
2321         {
2322           *err = REG_ESPACE;
2323           return NULL;
2324         }
2325       break;
2326     case ANCHOR:
2327       if ((token->opr.ctx_type
2328            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329           && dfa->word_ops_used == 0)
2330         init_word_char (dfa);
2331       if (token->opr.ctx_type == WORD_DELIM
2332           || token->opr.ctx_type == NOT_WORD_DELIM)
2333         {
2334           bin_tree_t *tree_first, *tree_last;
2335           if (token->opr.ctx_type == WORD_DELIM)
2336             {
2337               token->opr.ctx_type = WORD_FIRST;
2338               tree_first = create_token_tree (dfa, NULL, NULL, token);
2339               token->opr.ctx_type = WORD_LAST;
2340             }
2341           else
2342             {
2343               token->opr.ctx_type = INSIDE_WORD;
2344               tree_first = create_token_tree (dfa, NULL, NULL, token);
2345               token->opr.ctx_type = INSIDE_NOTWORD;
2346             }
2347           tree_last = create_token_tree (dfa, NULL, NULL, token);
2348           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350             {
2351               *err = REG_ESPACE;
2352               return NULL;
2353             }
2354         }
2355       else
2356         {
2357           tree = create_token_tree (dfa, NULL, NULL, token);
2358           if (BE (tree == NULL, 0))
2359             {
2360               *err = REG_ESPACE;
2361               return NULL;
2362             }
2363         }
2364       /* We must return here, since ANCHORs can't be followed
2365          by repetition operators.
2366          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368       fetch_token (token, regexp, syntax);
2369       return tree;
2370     case OP_PERIOD:
2371       tree = create_token_tree (dfa, NULL, NULL, token);
2372       if (BE (tree == NULL, 0))
2373         {
2374           *err = REG_ESPACE;
2375           return NULL;
2376         }
2377       if (dfa->mb_cur_max > 1)
2378         dfa->has_mb_node = 1;
2379       break;
2380     case OP_WORD:
2381     case OP_NOTWORD:
2382       tree = build_charclass_op (dfa, regexp->trans,
2383                                  (const unsigned char *) "alnum",
2384                                  (const unsigned char *) "_",
2385                                  token->type == OP_NOTWORD, err);
2386       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387         return NULL;
2388       break;
2389     case OP_SPACE:
2390     case OP_NOTSPACE:
2391       tree = build_charclass_op (dfa, regexp->trans,
2392                                  (const unsigned char *) "space",
2393                                  (const unsigned char *) "",
2394                                  token->type == OP_NOTSPACE, err);
2395       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396         return NULL;
2397       break;
2398     case OP_ALT:
2399     case END_OF_RE:
2400       return NULL;
2401     case BACK_SLASH:
2402       *err = REG_EESCAPE;
2403       return NULL;
2404     default:
2405       /* Must not happen?  */
2406 #ifdef DEBUG
2407       assert (0);
2408 #endif
2409       return NULL;
2410     }
2411   fetch_token (token, regexp, syntax);
2412
2413   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415     {
2416       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418         return NULL;
2419       /* In BRE consecutive duplications are not allowed.  */
2420       if ((syntax & RE_CONTEXT_INVALID_DUP)
2421           && (token->type == OP_DUP_ASTERISK
2422               || token->type == OP_OPEN_DUP_NUM))
2423         {
2424           *err = REG_BADRPT;
2425           return NULL;
2426         }
2427     }
2428
2429   return tree;
2430 }
2431
2432 /* This function build the following tree, from regular expression
2433    (<reg_exp>):
2434          SUBEXP
2435             |
2436         <reg_exp>
2437 */
2438
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2442 {
2443   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444   bin_tree_t *tree;
2445   size_t cur_nsub;
2446   cur_nsub = preg->re_nsub++;
2447
2448   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450   /* The subexpression may be a null string.  */
2451   if (token->type == OP_CLOSE_SUBEXP)
2452     tree = NULL;
2453   else
2454     {
2455       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457         *err = REG_EPAREN;
2458       if (BE (*err != REG_NOERROR, 0))
2459         return NULL;
2460     }
2461
2462   if (cur_nsub <= '9' - '1')
2463     dfa->completed_bkref_map |= 1 << cur_nsub;
2464
2465   tree = create_tree (dfa, tree, NULL, SUBEXP);
2466   if (BE (tree == NULL, 0))
2467     {
2468       *err = REG_ESPACE;
2469       return NULL;
2470     }
2471   tree->token.opr.idx = cur_nsub;
2472   return tree;
2473 }
2474
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2476
2477 static bin_tree_t *
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480 {
2481   bin_tree_t *tree = NULL, *old_tree = NULL;
2482   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2483   re_token_t start_token = *token;
2484
2485   if (token->type == OP_OPEN_DUP_NUM)
2486     {
2487       end = 0;
2488       start = fetch_number (regexp, token, syntax);
2489       if (start == REG_MISSING)
2490         {
2491           if (token->type == CHARACTER && token->opr.c == ',')
2492             start = 0; /* We treat "{,m}" as "{0,m}".  */
2493           else
2494             {
2495               *err = REG_BADBR; /* <re>{} is invalid.  */
2496               return NULL;
2497             }
2498         }
2499       if (BE (start != REG_ERROR, 1))
2500         {
2501           /* We treat "{n}" as "{n,n}".  */
2502           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503                  : ((token->type == CHARACTER && token->opr.c == ',')
2504                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2505         }
2506       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2507         {
2508           /* Invalid sequence.  */
2509           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2510             {
2511               if (token->type == END_OF_RE)
2512                 *err = REG_EBRACE;
2513               else
2514                 *err = REG_BADBR;
2515
2516               return NULL;
2517             }
2518
2519           /* If the syntax bit is set, rollback.  */
2520           re_string_set_index (regexp, start_idx);
2521           *token = start_token;
2522           token->type = CHARACTER;
2523           /* mb_partial and word_char bits should be already initialized by
2524              peek_token.  */
2525           return elem;
2526         }
2527
2528       if (BE ((end != REG_MISSING && start > end)
2529               || token->type != OP_CLOSE_DUP_NUM, 0))
2530         {
2531           /* First number greater than second.  */
2532           *err = REG_BADBR;
2533           return NULL;
2534         }
2535     }
2536   else
2537     {
2538       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2540     }
2541
2542   fetch_token (token, regexp, syntax);
2543
2544   if (BE (elem == NULL, 0))
2545     return NULL;
2546   if (BE (start == 0 && end == 0, 0))
2547     {
2548       postorder (elem, free_tree, NULL);
2549       return NULL;
2550     }
2551
2552   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2553   if (BE (start > 0, 0))
2554     {
2555       tree = elem;
2556       for (i = 2; i <= start; ++i)
2557         {
2558           elem = duplicate_tree (elem, dfa);
2559           tree = create_tree (dfa, tree, elem, CONCAT);
2560           if (BE (elem == NULL || tree == NULL, 0))
2561             goto parse_dup_op_espace;
2562         }
2563
2564       if (start == end)
2565         return tree;
2566
2567       /* Duplicate ELEM before it is marked optional.  */
2568       elem = duplicate_tree (elem, dfa);
2569       old_tree = tree;
2570     }
2571   else
2572     old_tree = NULL;
2573
2574   if (elem->token.type == SUBEXP)
2575     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576
2577   tree = create_tree (dfa, elem, NULL,
2578                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2579   if (BE (tree == NULL, 0))
2580     goto parse_dup_op_espace;
2581
2582 /* From gnulib's "intprops.h":
2583    True if the arithmetic type T is signed.  */
2584 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2585
2586   /* This loop is actually executed only when end != REG_MISSING,
2587      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2588      already created the start+1-th copy.  */
2589   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2590     for (i = start + 2; i <= end; ++i)
2591       {
2592         elem = duplicate_tree (elem, dfa);
2593         tree = create_tree (dfa, tree, elem, CONCAT);
2594         if (BE (elem == NULL || tree == NULL, 0))
2595           goto parse_dup_op_espace;
2596
2597         tree = create_tree (dfa, tree, NULL, OP_ALT);
2598         if (BE (tree == NULL, 0))
2599           goto parse_dup_op_espace;
2600       }
2601
2602   if (old_tree)
2603     tree = create_tree (dfa, old_tree, tree, CONCAT);
2604
2605   return tree;
2606
2607  parse_dup_op_espace:
2608   *err = REG_ESPACE;
2609   return NULL;
2610 }
2611
2612 /* Size of the names for collating symbol/equivalence_class/character_class.
2613    I'm not sure, but maybe enough.  */
2614 #define BRACKET_NAME_BUF_SIZE 32
2615
2616 #ifndef _LIBC
2617   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2618      Build the range expression which starts from START_ELEM, and ends
2619      at END_ELEM.  The result are written to MBCSET and SBCSET.
2620      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2621      mbcset->range_ends, is a pointer argument sinse we may
2622      update it.  */
2623
2624 static reg_errcode_t
2625 internal_function
2626 # ifdef RE_ENABLE_I18N
2627 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2628                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2629 # else /* not RE_ENABLE_I18N */
2630 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2631                  bracket_elem_t *end_elem)
2632 # endif /* not RE_ENABLE_I18N */
2633 {
2634   unsigned int start_ch, end_ch;
2635   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2636   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2637           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2638           0))
2639     return REG_ERANGE;
2640
2641   /* We can handle no multi character collating elements without libc
2642      support.  */
2643   if (BE ((start_elem->type == COLL_SYM
2644            && strlen ((char *) start_elem->opr.name) > 1)
2645           || (end_elem->type == COLL_SYM
2646               && strlen ((char *) end_elem->opr.name) > 1), 0))
2647     return REG_ECOLLATE;
2648
2649 # ifdef RE_ENABLE_I18N
2650   {
2651     wchar_t wc;
2652     wint_t start_wc;
2653     wint_t end_wc;
2654     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2655
2656     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2657                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658                    : 0));
2659     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2660               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661                  : 0));
2662     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2663                 ? __btowc (start_ch) : start_elem->opr.wch);
2664     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2665               ? __btowc (end_ch) : end_elem->opr.wch);
2666     if (start_wc == WEOF || end_wc == WEOF)
2667       return REG_ECOLLATE;
2668     cmp_buf[0] = start_wc;
2669     cmp_buf[4] = end_wc;
2670     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2671       return REG_ERANGE;
2672
2673     /* Got valid collation sequence values, add them as a new entry.
2674        However, for !_LIBC we have no collation elements: if the
2675        character set is single byte, the single byte character set
2676        that we build below suffices.  parse_bracket_exp passes
2677        no MBCSET if dfa->mb_cur_max == 1.  */
2678     if (mbcset)
2679       {
2680         /* Check the space of the arrays.  */
2681         if (BE (*range_alloc == mbcset->nranges, 0))
2682           {
2683             /* There is not enough space, need realloc.  */
2684             wchar_t *new_array_start, *new_array_end;
2685             Idx new_nranges;
2686
2687             /* +1 in case of mbcset->nranges is 0.  */
2688             new_nranges = 2 * mbcset->nranges + 1;
2689             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2690                are NULL if *range_alloc == 0.  */
2691             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2692                                           new_nranges);
2693             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2694                                         new_nranges);
2695
2696             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2697               return REG_ESPACE;
2698
2699             mbcset->range_starts = new_array_start;
2700             mbcset->range_ends = new_array_end;
2701             *range_alloc = new_nranges;
2702           }
2703
2704         mbcset->range_starts[mbcset->nranges] = start_wc;
2705         mbcset->range_ends[mbcset->nranges++] = end_wc;
2706       }
2707
2708     /* Build the table for single byte characters.  */
2709     for (wc = 0; wc < SBC_MAX; ++wc)
2710       {
2711         cmp_buf[2] = wc;
2712         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2713             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2714           bitset_set (sbcset, wc);
2715       }
2716   }
2717 # else /* not RE_ENABLE_I18N */
2718   {
2719     unsigned int ch;
2720     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2721                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2722                    : 0));
2723     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2724               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2725                  : 0));
2726     if (start_ch > end_ch)
2727       return REG_ERANGE;
2728     /* Build the table for single byte characters.  */
2729     for (ch = 0; ch < SBC_MAX; ++ch)
2730       if (start_ch <= ch  && ch <= end_ch)
2731         bitset_set (sbcset, ch);
2732   }
2733 # endif /* not RE_ENABLE_I18N */
2734   return REG_NOERROR;
2735 }
2736 #endif /* not _LIBC */
2737
2738 #ifndef _LIBC
2739 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2740    Build the collating element which is represented by NAME.
2741    The result are written to MBCSET and SBCSET.
2742    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2743    pointer argument since we may update it.  */
2744
2745 static reg_errcode_t
2746 internal_function
2747 build_collating_symbol (bitset_t sbcset,
2748 # ifdef RE_ENABLE_I18N
2749                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2750 # endif
2751                         const unsigned char *name)
2752 {
2753   size_t name_len = strlen ((const char *) name);
2754   if (BE (name_len != 1, 0))
2755     return REG_ECOLLATE;
2756   else
2757     {
2758       bitset_set (sbcset, name[0]);
2759       return REG_NOERROR;
2760     }
2761 }
2762 #endif /* not _LIBC */
2763
2764 /* This function parse bracket expression like "[abc]", "[a-c]",
2765    "[[.a-a.]]" etc.  */
2766
2767 static bin_tree_t *
2768 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2769                    reg_syntax_t syntax, reg_errcode_t *err)
2770 {
2771 #ifdef _LIBC
2772   const unsigned char *collseqmb;
2773   const char *collseqwc;
2774   uint32_t nrules;
2775   int32_t table_size;
2776   const int32_t *symb_table;
2777   const unsigned char *extra;
2778
2779   /* Local function for parse_bracket_exp used in _LIBC environement.
2780      Seek the collating symbol entry correspondings to NAME.
2781      Return the index of the symbol in the SYMB_TABLE.  */
2782
2783   auto inline int32_t
2784   __attribute ((always_inline))
2785   seek_collating_symbol_entry (name, name_len)
2786          const unsigned char *name;
2787          size_t name_len;
2788     {
2789       int32_t hash = elem_hash ((const char *) name, name_len);
2790       int32_t elem = hash % table_size;
2791       if (symb_table[2 * elem] != 0)
2792         {
2793           int32_t second = hash % (table_size - 2) + 1;
2794
2795           do
2796             {
2797               /* First compare the hashing value.  */
2798               if (symb_table[2 * elem] == hash
2799                   /* Compare the length of the name.  */
2800                   && name_len == extra[symb_table[2 * elem + 1]]
2801                   /* Compare the name.  */
2802                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2803                              name_len) == 0)
2804                 {
2805                   /* Yep, this is the entry.  */
2806                   break;
2807                 }
2808
2809               /* Next entry.  */
2810               elem += second;
2811             }
2812           while (symb_table[2 * elem] != 0);
2813         }
2814       return elem;
2815     }
2816
2817   /* Local function for parse_bracket_exp used in _LIBC environment.
2818      Look up the collation sequence value of BR_ELEM.
2819      Return the value if succeeded, UINT_MAX otherwise.  */
2820
2821   auto inline unsigned int
2822   __attribute ((always_inline))
2823   lookup_collation_sequence_value (br_elem)
2824          bracket_elem_t *br_elem;
2825     {
2826       if (br_elem->type == SB_CHAR)
2827         {
2828           /*
2829           if (MB_CUR_MAX == 1)
2830           */
2831           if (nrules == 0)
2832             return collseqmb[br_elem->opr.ch];
2833           else
2834             {
2835               wint_t wc = __btowc (br_elem->opr.ch);
2836               return __collseq_table_lookup (collseqwc, wc);
2837             }
2838         }
2839       else if (br_elem->type == MB_CHAR)
2840         {
2841           if (nrules != 0)
2842             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2843         }
2844       else if (br_elem->type == COLL_SYM)
2845         {
2846           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2847           if (nrules != 0)
2848             {
2849               int32_t elem, idx;
2850               elem = seek_collating_symbol_entry (br_elem->opr.name,
2851                                                   sym_name_len);
2852               if (symb_table[2 * elem] != 0)
2853                 {
2854                   /* We found the entry.  */
2855                   idx = symb_table[2 * elem + 1];
2856                   /* Skip the name of collating element name.  */
2857                   idx += 1 + extra[idx];
2858                   /* Skip the byte sequence of the collating element.  */
2859                   idx += 1 + extra[idx];
2860                   /* Adjust for the alignment.  */
2861                   idx = (idx + 3) & ~3;
2862                   /* Skip the multibyte collation sequence value.  */
2863                   idx += sizeof (unsigned int);
2864                   /* Skip the wide char sequence of the collating element.  */
2865                   idx += sizeof (unsigned int) *
2866                     (1 + *(unsigned int *) (extra + idx));
2867                   /* Return the collation sequence value.  */
2868                   return *(unsigned int *) (extra + idx);
2869                 }
2870               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2871                 {
2872                   /* No valid character.  Match it as a single byte
2873                      character.  */
2874                   return collseqmb[br_elem->opr.name[0]];
2875                 }
2876             }
2877           else if (sym_name_len == 1)
2878             return collseqmb[br_elem->opr.name[0]];
2879         }
2880       return UINT_MAX;
2881     }
2882
2883   /* Local function for parse_bracket_exp used in _LIBC environement.
2884      Build the range expression which starts from START_ELEM, and ends
2885      at END_ELEM.  The result are written to MBCSET and SBCSET.
2886      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2887      mbcset->range_ends, is a pointer argument sinse we may
2888      update it.  */
2889
2890   auto inline reg_errcode_t
2891   __attribute ((always_inline))
2892   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2893          re_charset_t *mbcset;
2894          Idx *range_alloc;
2895          bitset_t sbcset;
2896          bracket_elem_t *start_elem, *end_elem;
2897     {
2898       unsigned int ch;
2899       uint32_t start_collseq;
2900       uint32_t end_collseq;
2901
2902       /* Equivalence Classes and Character Classes can't be a range
2903          start/end.  */
2904       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2905               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2906               0))
2907         return REG_ERANGE;
2908
2909       start_collseq = lookup_collation_sequence_value (start_elem);
2910       end_collseq = lookup_collation_sequence_value (end_elem);
2911       /* Check start/end collation sequence values.  */
2912       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2913         return REG_ECOLLATE;
2914       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2915         return REG_ERANGE;
2916
2917       /* Got valid collation sequence values, add them as a new entry.
2918          However, if we have no collation elements, and the character set
2919          is single byte, the single byte character set that we
2920          build below suffices. */
2921       if (nrules > 0 || dfa->mb_cur_max > 1)
2922         {
2923           /* Check the space of the arrays.  */
2924           if (BE (*range_alloc == mbcset->nranges, 0))
2925             {
2926               /* There is not enough space, need realloc.  */
2927               uint32_t *new_array_start;
2928               uint32_t *new_array_end;
2929               Idx new_nranges;
2930
2931               /* +1 in case of mbcset->nranges is 0.  */
2932               new_nranges = 2 * mbcset->nranges + 1;
2933               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2934                                             new_nranges);
2935               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2936                                           new_nranges);
2937
2938               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2939                 return REG_ESPACE;
2940
2941               mbcset->range_starts = new_array_start;
2942               mbcset->range_ends = new_array_end;
2943               *range_alloc = new_nranges;
2944             }
2945
2946           mbcset->range_starts[mbcset->nranges] = start_collseq;
2947           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2948         }
2949
2950       /* Build the table for single byte characters.  */
2951       for (ch = 0; ch < SBC_MAX; ch++)
2952         {
2953           uint32_t ch_collseq;
2954           /*
2955           if (MB_CUR_MAX == 1)
2956           */
2957           if (nrules == 0)
2958             ch_collseq = collseqmb[ch];
2959           else
2960             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2961           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2962             bitset_set (sbcset, ch);
2963         }
2964       return REG_NOERROR;
2965     }
2966
2967   /* Local function for parse_bracket_exp used in _LIBC environement.
2968      Build the collating element which is represented by NAME.
2969      The result are written to MBCSET and SBCSET.
2970      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2971      pointer argument sinse we may update it.  */
2972
2973   auto inline reg_errcode_t
2974   __attribute ((always_inline))
2975   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2976          re_charset_t *mbcset;
2977          Idx *coll_sym_alloc;
2978          bitset_t sbcset;
2979          const unsigned char *name;
2980     {
2981       int32_t elem, idx;
2982       size_t name_len = strlen ((const char *) name);
2983       if (nrules != 0)
2984         {
2985           elem = seek_collating_symbol_entry (name, name_len);
2986           if (symb_table[2 * elem] != 0)
2987             {
2988               /* We found the entry.  */
2989               idx = symb_table[2 * elem + 1];
2990               /* Skip the name of collating element name.  */
2991               idx += 1 + extra[idx];
2992             }
2993           else if (symb_table[2 * elem] == 0 && name_len == 1)
2994             {
2995               /* No valid character, treat it as a normal
2996                  character.  */
2997               bitset_set (sbcset, name[0]);
2998               return REG_NOERROR;
2999             }
3000           else
3001             return REG_ECOLLATE;
3002
3003           /* Got valid collation sequence, add it as a new entry.  */
3004           /* Check the space of the arrays.  */
3005           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3006             {
3007               /* Not enough, realloc it.  */
3008               /* +1 in case of mbcset->ncoll_syms is 0.  */
3009               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3010               /* Use realloc since mbcset->coll_syms is NULL
3011                  if *alloc == 0.  */
3012               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3013                                                    new_coll_sym_alloc);
3014               if (BE (new_coll_syms == NULL, 0))
3015                 return REG_ESPACE;
3016               mbcset->coll_syms = new_coll_syms;
3017               *coll_sym_alloc = new_coll_sym_alloc;
3018             }
3019           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3020           return REG_NOERROR;
3021         }
3022       else
3023         {
3024           if (BE (name_len != 1, 0))
3025             return REG_ECOLLATE;
3026           else
3027             {
3028               bitset_set (sbcset, name[0]);
3029               return REG_NOERROR;
3030             }
3031         }
3032     }
3033 #endif
3034
3035   re_token_t br_token;
3036   re_bitset_ptr_t sbcset;
3037 #ifdef RE_ENABLE_I18N
3038   re_charset_t *mbcset;
3039   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3040   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3041 #endif /* not RE_ENABLE_I18N */
3042   bool non_match = false;
3043   bin_tree_t *work_tree;
3044   int token_len;
3045   bool first_round = true;
3046 #ifdef _LIBC
3047   collseqmb = (const unsigned char *)
3048     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3049   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3050   if (nrules)
3051     {
3052       /*
3053       if (MB_CUR_MAX > 1)
3054       */
3055       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3056       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3057       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3058                                                   _NL_COLLATE_SYMB_TABLEMB);
3059       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3060                                                    _NL_COLLATE_SYMB_EXTRAMB);
3061     }
3062 #endif
3063   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3064 #ifdef RE_ENABLE_I18N
3065   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3066 #endif /* RE_ENABLE_I18N */
3067 #ifdef RE_ENABLE_I18N
3068   if (BE (sbcset == NULL || mbcset == NULL, 0))
3069 #else
3070   if (BE (sbcset == NULL, 0))
3071 #endif /* RE_ENABLE_I18N */
3072     {
3073       *err = REG_ESPACE;
3074       return NULL;
3075     }
3076
3077   token_len = peek_token_bracket (token, regexp, syntax);
3078   if (BE (token->type == END_OF_RE, 0))
3079     {
3080       *err = REG_BADPAT;
3081       goto parse_bracket_exp_free_return;
3082     }
3083   if (token->type == OP_NON_MATCH_LIST)
3084     {
3085 #ifdef RE_ENABLE_I18N
3086       mbcset->non_match = 1;
3087 #endif /* not RE_ENABLE_I18N */
3088       non_match = true;
3089       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3090         bitset_set (sbcset, '\n');
3091       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3092       token_len = peek_token_bracket (token, regexp, syntax);
3093       if (BE (token->type == END_OF_RE, 0))
3094         {
3095           *err = REG_BADPAT;
3096           goto parse_bracket_exp_free_return;
3097         }
3098     }
3099
3100   /* We treat the first ']' as a normal character.  */
3101   if (token->type == OP_CLOSE_BRACKET)
3102     token->type = CHARACTER;
3103
3104   while (1)
3105     {
3106       bracket_elem_t start_elem, end_elem;
3107       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3108       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3109       reg_errcode_t ret;
3110       int token_len2 = 0;
3111       bool is_range_exp = false;
3112       re_token_t token2;
3113
3114       start_elem.opr.name = start_name_buf;
3115       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3116                                    syntax, first_round);
3117       if (BE (ret != REG_NOERROR, 0))
3118         {
3119           *err = ret;
3120           goto parse_bracket_exp_free_return;
3121         }
3122       first_round = false;
3123
3124       /* Get information about the next token.  We need it in any case.  */
3125       token_len = peek_token_bracket (token, regexp, syntax);
3126
3127       /* Do not check for ranges if we know they are not allowed.  */
3128       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3129         {
3130           if (BE (token->type == END_OF_RE, 0))
3131             {
3132               *err = REG_EBRACK;
3133               goto parse_bracket_exp_free_return;
3134             }
3135           if (token->type == OP_CHARSET_RANGE)
3136             {
3137               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3138               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3139               if (BE (token2.type == END_OF_RE, 0))
3140                 {
3141                   *err = REG_EBRACK;
3142                   goto parse_bracket_exp_free_return;
3143                 }
3144               if (token2.type == OP_CLOSE_BRACKET)
3145                 {
3146                   /* We treat the last '-' as a normal character.  */
3147                   re_string_skip_bytes (regexp, -token_len);
3148                   token->type = CHARACTER;
3149                 }
3150               else
3151                 is_range_exp = true;
3152             }
3153         }
3154
3155       if (is_range_exp == true)
3156         {
3157           end_elem.opr.name = end_name_buf;
3158           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3159                                        dfa, syntax, true);
3160           if (BE (ret != REG_NOERROR, 0))
3161             {
3162               *err = ret;
3163               goto parse_bracket_exp_free_return;
3164             }
3165
3166           token_len = peek_token_bracket (token, regexp, syntax);
3167
3168 #ifdef _LIBC
3169           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3170                                   &start_elem, &end_elem);
3171 #else
3172 # ifdef RE_ENABLE_I18N
3173           *err = build_range_exp (sbcset,
3174                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3175                                   &range_alloc, &start_elem, &end_elem);
3176 # else
3177           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3178 # endif
3179 #endif /* RE_ENABLE_I18N */
3180           if (BE (*err != REG_NOERROR, 0))
3181             goto parse_bracket_exp_free_return;
3182         }
3183       else
3184         {
3185           switch (start_elem.type)
3186             {
3187             case SB_CHAR:
3188               bitset_set (sbcset, start_elem.opr.ch);
3189               break;
3190 #ifdef RE_ENABLE_I18N
3191             case MB_CHAR:
3192               /* Check whether the array has enough space.  */
3193               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3194                 {
3195                   wchar_t *new_mbchars;
3196                   /* Not enough, realloc it.  */
3197                   /* +1 in case of mbcset->nmbchars is 0.  */
3198                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3199                   /* Use realloc since array is NULL if *alloc == 0.  */
3200                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3201                                             mbchar_alloc);
3202                   if (BE (new_mbchars == NULL, 0))
3203                     goto parse_bracket_exp_espace;
3204                   mbcset->mbchars = new_mbchars;
3205                 }
3206               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3207               break;
3208 #endif /* RE_ENABLE_I18N */
3209             case EQUIV_CLASS:
3210               *err = build_equiv_class (sbcset,
3211 #ifdef RE_ENABLE_I18N
3212                                         mbcset, &equiv_class_alloc,
3213 #endif /* RE_ENABLE_I18N */
3214                                         start_elem.opr.name);
3215               if (BE (*err != REG_NOERROR, 0))
3216                 goto parse_bracket_exp_free_return;
3217               break;
3218             case COLL_SYM:
3219               *err = build_collating_symbol (sbcset,
3220 #ifdef RE_ENABLE_I18N
3221                                              mbcset, &coll_sym_alloc,
3222 #endif /* RE_ENABLE_I18N */
3223                                              start_elem.opr.name);
3224               if (BE (*err != REG_NOERROR, 0))
3225                 goto parse_bracket_exp_free_return;
3226               break;
3227             case CHAR_CLASS:
3228               *err = build_charclass (regexp->trans, sbcset,
3229 #ifdef RE_ENABLE_I18N
3230                                       mbcset, &char_class_alloc,
3231 #endif /* RE_ENABLE_I18N */
3232                                       start_elem.opr.name, syntax);
3233               if (BE (*err != REG_NOERROR, 0))
3234                goto parse_bracket_exp_free_return;
3235               break;
3236             default:
3237               assert (0);
3238               break;
3239             }
3240         }
3241       if (BE (token->type == END_OF_RE, 0))
3242         {
3243           *err = REG_EBRACK;
3244           goto parse_bracket_exp_free_return;
3245         }
3246       if (token->type == OP_CLOSE_BRACKET)
3247         break;
3248     }
3249
3250   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3251
3252   /* If it is non-matching list.  */
3253   if (non_match)
3254     bitset_not (sbcset);
3255
3256 #ifdef RE_ENABLE_I18N
3257   /* Ensure only single byte characters are set.  */
3258   if (dfa->mb_cur_max > 1)
3259     bitset_mask (sbcset, dfa->sb_char);
3260
3261   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3262       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3263                                                      || mbcset->non_match)))
3264     {
3265       bin_tree_t *mbc_tree;
3266       int sbc_idx;
3267       /* Build a tree for complex bracket.  */
3268       dfa->has_mb_node = 1;
3269       br_token.type = COMPLEX_BRACKET;
3270       br_token.opr.mbcset = mbcset;
3271       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272       if (BE (mbc_tree == NULL, 0))
3273         goto parse_bracket_exp_espace;
3274       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3275         if (sbcset[sbc_idx])
3276           break;
3277       /* If there are no bits set in sbcset, there is no point
3278          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3279       if (sbc_idx < BITSET_WORDS)
3280         {
3281           /* Build a tree for simple bracket.  */
3282           br_token.type = SIMPLE_BRACKET;
3283           br_token.opr.sbcset = sbcset;
3284           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3285           if (BE (work_tree == NULL, 0))
3286             goto parse_bracket_exp_espace;
3287
3288           /* Then join them by ALT node.  */
3289           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3290           if (BE (work_tree == NULL, 0))
3291             goto parse_bracket_exp_espace;
3292         }
3293       else
3294         {
3295           re_free (sbcset);
3296           work_tree = mbc_tree;
3297         }
3298     }
3299   else
3300 #endif /* not RE_ENABLE_I18N */
3301     {
3302 #ifdef RE_ENABLE_I18N
3303       free_charset (mbcset);
3304 #endif
3305       /* Build a tree for simple bracket.  */
3306       br_token.type = SIMPLE_BRACKET;
3307       br_token.opr.sbcset = sbcset;
3308       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3309       if (BE (work_tree == NULL, 0))
3310         goto parse_bracket_exp_espace;
3311     }
3312   return work_tree;
3313
3314  parse_bracket_exp_espace:
3315   *err = REG_ESPACE;
3316  parse_bracket_exp_free_return:
3317   re_free (sbcset);
3318 #ifdef RE_ENABLE_I18N
3319   free_charset (mbcset);
3320 #endif /* RE_ENABLE_I18N */
3321   return NULL;
3322 }
3323
3324 /* Parse an element in the bracket expression.  */
3325
3326 static reg_errcode_t
3327 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3328                        re_token_t *token, int token_len, re_dfa_t *dfa,
3329                        reg_syntax_t syntax, bool accept_hyphen)
3330 {
3331 #ifdef RE_ENABLE_I18N
3332   int cur_char_size;
3333   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3334   if (cur_char_size > 1)
3335     {
3336       elem->type = MB_CHAR;
3337       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3338       re_string_skip_bytes (regexp, cur_char_size);
3339       return REG_NOERROR;
3340     }
3341 #endif /* RE_ENABLE_I18N */
3342   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3343   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3344       || token->type == OP_OPEN_EQUIV_CLASS)
3345     return parse_bracket_symbol (elem, regexp, token);
3346   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3347     {
3348       /* A '-' must only appear as anything but a range indicator before
3349          the closing bracket.  Everything else is an error.  */
3350       re_token_t token2;
3351       (void) peek_token_bracket (&token2, regexp, syntax);
3352       if (token2.type != OP_CLOSE_BRACKET)
3353         /* The actual error value is not standardized since this whole
3354            case is undefined.  But ERANGE makes good sense.  */
3355         return REG_ERANGE;
3356     }
3357   elem->type = SB_CHAR;
3358   elem->opr.ch = token->opr.c;
3359   return REG_NOERROR;
3360 }
3361
3362 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3363    such as [:<character_class>:], [.<collating_element>.], and
3364    [=<equivalent_class>=].  */
3365
3366 static reg_errcode_t
3367 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3368                       re_token_t *token)
3369 {
3370   unsigned char ch, delim = token->opr.c;
3371   int i = 0;
3372   if (re_string_eoi(regexp))
3373     return REG_EBRACK;
3374   for (;; ++i)
3375     {
3376       if (i >= BRACKET_NAME_BUF_SIZE)
3377         return REG_EBRACK;
3378       if (token->type == OP_OPEN_CHAR_CLASS)
3379         ch = re_string_fetch_byte_case (regexp);
3380       else
3381         ch = re_string_fetch_byte (regexp);
3382       if (re_string_eoi(regexp))
3383         return REG_EBRACK;
3384       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3385         break;
3386       elem->opr.name[i] = ch;
3387     }
3388   re_string_skip_bytes (regexp, 1);
3389   elem->opr.name[i] = '\0';
3390   switch (token->type)
3391     {
3392     case OP_OPEN_COLL_ELEM:
3393       elem->type = COLL_SYM;
3394       break;
3395     case OP_OPEN_EQUIV_CLASS:
3396       elem->type = EQUIV_CLASS;
3397       break;
3398     case OP_OPEN_CHAR_CLASS:
3399       elem->type = CHAR_CLASS;
3400       break;
3401     default:
3402       break;
3403     }
3404   return REG_NOERROR;
3405 }
3406
3407   /* Helper function for parse_bracket_exp.
3408      Build the equivalence class which is represented by NAME.
3409      The result are written to MBCSET and SBCSET.
3410      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3411      is a pointer argument sinse we may update it.  */
3412
3413 static reg_errcode_t
3414 #ifdef RE_ENABLE_I18N
3415 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3416                    Idx *equiv_class_alloc, const unsigned char *name)
3417 #else /* not RE_ENABLE_I18N */
3418 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3419 #endif /* not RE_ENABLE_I18N */
3420 {
3421 #ifdef _LIBC
3422   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3423   if (nrules != 0)
3424     {
3425       const int32_t *table, *indirect;
3426       const unsigned char *weights, *extra, *cp;
3427       unsigned char char_buf[2];
3428       int32_t idx1, idx2;
3429       unsigned int ch;
3430       size_t len;
3431       /* This #include defines a local function!  */
3432 # include <locale/weight.h>
3433       /* Calculate the index for equivalence class.  */
3434       cp = name;
3435       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3436       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3437                                                _NL_COLLATE_WEIGHTMB);
3438       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3439                                                    _NL_COLLATE_EXTRAMB);
3440       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3441                                                 _NL_COLLATE_INDIRECTMB);
3442       idx1 = findidx (&cp);
3443       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3444         /* This isn't a valid character.  */
3445         return REG_ECOLLATE;
3446
3447       /* Build single byte matcing table for this equivalence class.  */
3448       char_buf[1] = (unsigned char) '\0';
3449       len = weights[idx1 & 0xffffff];
3450       for (ch = 0; ch < SBC_MAX; ++ch)
3451         {
3452           char_buf[0] = ch;
3453           cp = char_buf;
3454           idx2 = findidx (&cp);
3455 /*
3456           idx2 = table[ch];
3457 */
3458           if (idx2 == 0)
3459             /* This isn't a valid character.  */
3460             continue;
3461           /* Compare only if the length matches and the collation rule
3462              index is the same.  */
3463           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3464             {
3465               int cnt = 0;
3466
3467               while (cnt <= len &&
3468                      weights[(idx1 & 0xffffff) + 1 + cnt]
3469                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3470                 ++cnt;
3471
3472               if (cnt > len)
3473                 bitset_set (sbcset, ch);
3474             }
3475         }
3476       /* Check whether the array has enough space.  */
3477       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3478         {
3479           /* Not enough, realloc it.  */
3480           /* +1 in case of mbcset->nequiv_classes is 0.  */
3481           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3482           /* Use realloc since the array is NULL if *alloc == 0.  */
3483           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3484                                                    int32_t,
3485                                                    new_equiv_class_alloc);
3486           if (BE (new_equiv_classes == NULL, 0))
3487             return REG_ESPACE;
3488           mbcset->equiv_classes = new_equiv_classes;
3489           *equiv_class_alloc = new_equiv_class_alloc;
3490         }
3491       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3492     }
3493   else
3494 #endif /* _LIBC */
3495     {
3496       if (BE (strlen ((const char *) name) != 1, 0))
3497         return REG_ECOLLATE;
3498       bitset_set (sbcset, *name);
3499     }
3500   return REG_NOERROR;
3501 }
3502
3503   /* Helper function for parse_bracket_exp.
3504      Build the character class which is represented by NAME.
3505      The result are written to MBCSET and SBCSET.
3506      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3507      is a pointer argument sinse we may update it.  */
3508
3509 static reg_errcode_t
3510 #ifdef RE_ENABLE_I18N
3511 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3512                  re_charset_t *mbcset, Idx *char_class_alloc,
3513                  const unsigned char *class_name, reg_syntax_t syntax)
3514 #else /* not RE_ENABLE_I18N */
3515 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3516                  const unsigned char *class_name, reg_syntax_t syntax)
3517 #endif /* not RE_ENABLE_I18N */
3518 {
3519   int i;
3520   const char *name = (const char *) class_name;
3521
3522   /* In case of REG_ICASE "upper" and "lower" match the both of
3523      upper and lower cases.  */
3524   if ((syntax & RE_ICASE)
3525       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3526     name = "alpha";
3527
3528 #ifdef RE_ENABLE_I18N
3529   /* Check the space of the arrays.  */
3530   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3531     {
3532       /* Not enough, realloc it.  */
3533       /* +1 in case of mbcset->nchar_classes is 0.  */
3534       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3535       /* Use realloc since array is NULL if *alloc == 0.  */
3536       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3537                                                new_char_class_alloc);
3538       if (BE (new_char_classes == NULL, 0))
3539         return REG_ESPACE;
3540       mbcset->char_classes = new_char_classes;
3541       *char_class_alloc = new_char_class_alloc;
3542     }
3543   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3544 #endif /* RE_ENABLE_I18N */
3545
3546 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3547   do {                                          \
3548     if (BE (trans != NULL, 0))                  \
3549       {                                         \
3550         for (i = 0; i < SBC_MAX; ++i)           \
3551           if (ctype_func (i))                   \
3552             bitset_set (sbcset, trans[i]);      \
3553       }                                         \
3554     else                                        \
3555       {                                         \
3556         for (i = 0; i < SBC_MAX; ++i)           \
3557           if (ctype_func (i))                   \
3558             bitset_set (sbcset, i);             \
3559       }                                         \
3560   } while (0)
3561
3562   if (strcmp (name, "alnum") == 0)
3563     BUILD_CHARCLASS_LOOP (isalnum);
3564   else if (strcmp (name, "cntrl") == 0)
3565     BUILD_CHARCLASS_LOOP (iscntrl);
3566   else if (strcmp (name, "lower") == 0)
3567     BUILD_CHARCLASS_LOOP (islower);
3568   else if (strcmp (name, "space") == 0)
3569     BUILD_CHARCLASS_LOOP (isspace);
3570   else if (strcmp (name, "alpha") == 0)
3571     BUILD_CHARCLASS_LOOP (isalpha);
3572   else if (strcmp (name, "digit") == 0)
3573     BUILD_CHARCLASS_LOOP (isdigit);
3574   else if (strcmp (name, "print") == 0)
3575     BUILD_CHARCLASS_LOOP (isprint);
3576   else if (strcmp (name, "upper") == 0)
3577     BUILD_CHARCLASS_LOOP (isupper);
3578   else if (strcmp (name, "blank") == 0)
3579     BUILD_CHARCLASS_LOOP (isblank);
3580   else if (strcmp (name, "graph") == 0)
3581     BUILD_CHARCLASS_LOOP (isgraph);
3582   else if (strcmp (name, "punct") == 0)
3583     BUILD_CHARCLASS_LOOP (ispunct);
3584   else if (strcmp (name, "xdigit") == 0)
3585     BUILD_CHARCLASS_LOOP (isxdigit);
3586   else
3587     return REG_ECTYPE;
3588
3589   return REG_NOERROR;
3590 }
3591
3592 static bin_tree_t *
3593 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3594                     const unsigned char *class_name,
3595                     const unsigned char *extra, bool non_match,
3596                     reg_errcode_t *err)
3597 {
3598   re_bitset_ptr_t sbcset;
3599 #ifdef RE_ENABLE_I18N
3600   re_charset_t *mbcset;
3601   Idx alloc = 0;
3602 #endif /* not RE_ENABLE_I18N */
3603   reg_errcode_t ret;
3604   re_token_t br_token;
3605   bin_tree_t *tree;
3606
3607   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3608 #ifdef RE_ENABLE_I18N
3609   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3610 #endif /* RE_ENABLE_I18N */
3611
3612 #ifdef RE_ENABLE_I18N
3613   if (BE (sbcset == NULL || mbcset == NULL, 0))
3614 #else /* not RE_ENABLE_I18N */
3615   if (BE (sbcset == NULL, 0))
3616 #endif /* not RE_ENABLE_I18N */
3617     {
3618       *err = REG_ESPACE;
3619       return NULL;
3620     }
3621
3622   if (non_match)
3623     {
3624 #ifdef RE_ENABLE_I18N
3625       mbcset->non_match = 1;
3626 #endif /* not RE_ENABLE_I18N */
3627     }
3628
3629   /* We don't care the syntax in this case.  */
3630   ret = build_charclass (trans, sbcset,
3631 #ifdef RE_ENABLE_I18N
3632                          mbcset, &alloc,
3633 #endif /* RE_ENABLE_I18N */
3634                          class_name, 0);
3635
3636   if (BE (ret != REG_NOERROR, 0))
3637     {
3638       re_free (sbcset);
3639 #ifdef RE_ENABLE_I18N
3640       free_charset (mbcset);
3641 #endif /* RE_ENABLE_I18N */
3642       *err = ret;
3643       return NULL;
3644     }
3645   /* \w match '_' also.  */
3646   for (; *extra; extra++)
3647     bitset_set (sbcset, *extra);
3648
3649   /* If it is non-matching list.  */
3650   if (non_match)
3651     bitset_not (sbcset);
3652
3653 #ifdef RE_ENABLE_I18N
3654   /* Ensure only single byte characters are set.  */
3655   if (dfa->mb_cur_max > 1)
3656     bitset_mask (sbcset, dfa->sb_char);
3657 #endif
3658
3659   /* Build a tree for simple bracket.  */
3660   br_token.type = SIMPLE_BRACKET;
3661   br_token.opr.sbcset = sbcset;
3662   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3663   if (BE (tree == NULL, 0))
3664     goto build_word_op_espace;
3665
3666 #ifdef RE_ENABLE_I18N
3667   if (dfa->mb_cur_max > 1)
3668     {
3669       bin_tree_t *mbc_tree;
3670       /* Build a tree for complex bracket.  */
3671       br_token.type = COMPLEX_BRACKET;
3672       br_token.opr.mbcset = mbcset;
3673       dfa->has_mb_node = 1;
3674       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3675       if (BE (mbc_tree == NULL, 0))
3676         goto build_word_op_espace;
3677       /* Then join them by ALT node.  */
3678       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3679       if (BE (mbc_tree != NULL, 1))
3680         return tree;
3681     }
3682   else
3683     {
3684       free_charset (mbcset);
3685       return tree;
3686     }
3687 #else /* not RE_ENABLE_I18N */
3688   return tree;
3689 #endif /* not RE_ENABLE_I18N */
3690
3691  build_word_op_espace:
3692   re_free (sbcset);
3693 #ifdef RE_ENABLE_I18N
3694   free_charset (mbcset);
3695 #endif /* RE_ENABLE_I18N */
3696   *err = REG_ESPACE;
3697   return NULL;
3698 }
3699
3700 /* This is intended for the expressions like "a{1,3}".
3701    Fetch a number from `input', and return the number.
3702    Return REG_MISSING if the number field is empty like "{,1}".
3703    Return REG_ERROR if an error occurred.  */
3704
3705 static Idx
3706 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3707 {
3708   Idx num = REG_MISSING;
3709   unsigned char c;
3710   while (1)
3711     {
3712       fetch_token (token, input, syntax);
3713       c = token->opr.c;
3714       if (BE (token->type == END_OF_RE, 0))
3715         return REG_ERROR;
3716       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3717         break;
3718       num = ((token->type != CHARACTER || c < '0' || '9' < c
3719               || num == REG_ERROR)
3720              ? REG_ERROR
3721              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3722       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3723     }
3724   return num;
3725 }
3726 \f
3727 #ifdef RE_ENABLE_I18N
3728 static void
3729 free_charset (re_charset_t *cset)
3730 {
3731   re_free (cset->mbchars);
3732 # ifdef _LIBC
3733   re_free (cset->coll_syms);
3734   re_free (cset->equiv_classes);
3735   re_free (cset->range_starts);
3736   re_free (cset->range_ends);
3737 # endif
3738   re_free (cset->char_classes);
3739   re_free (cset);
3740 }
3741 #endif /* RE_ENABLE_I18N */
3742 \f
3743 /* Functions for binary tree operation.  */
3744
3745 /* Create a tree node.  */
3746
3747 static bin_tree_t *
3748 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3749              re_token_type_t type)
3750 {
3751   re_token_t t;
3752   t.type = type;
3753   return create_token_tree (dfa, left, right, &t);
3754 }
3755
3756 static bin_tree_t *
3757 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3758                    const re_token_t *token)
3759 {
3760   bin_tree_t *tree;
3761   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3762     {
3763       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3764
3765       if (storage == NULL)
3766         return NULL;
3767       storage->next = dfa->str_tree_storage;
3768       dfa->str_tree_storage = storage;
3769       dfa->str_tree_storage_idx = 0;
3770     }
3771   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3772
3773   tree->parent = NULL;
3774   tree->left = left;
3775   tree->right = right;
3776   tree->token = *token;
3777   tree->token.duplicated = 0;
3778   tree->token.opt_subexp = 0;
3779   tree->first = NULL;
3780   tree->next = NULL;
3781   tree->node_idx = REG_MISSING;
3782
3783   if (left != NULL)
3784     left->parent = tree;
3785   if (right != NULL)
3786     right->parent = tree;
3787   return tree;
3788 }
3789
3790 /* Mark the tree SRC as an optional subexpression.
3791    To be called from preorder or postorder.  */
3792
3793 static reg_errcode_t
3794 mark_opt_subexp (void *extra, bin_tree_t *node)
3795 {
3796   Idx idx = (Idx) (long) extra;
3797   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3798     node->token.opt_subexp = 1;
3799
3800   return REG_NOERROR;
3801 }
3802
3803 /* Free the allocated memory inside NODE. */
3804
3805 static void
3806 free_token (re_token_t *node)
3807 {
3808 #ifdef RE_ENABLE_I18N
3809   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3810     free_charset (node->opr.mbcset);
3811   else
3812 #endif /* RE_ENABLE_I18N */
3813     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3814       re_free (node->opr.sbcset);
3815 }
3816
3817 /* Worker function for tree walking.  Free the allocated memory inside NODE
3818    and its children. */
3819
3820 static reg_errcode_t
3821 free_tree (void *extra, bin_tree_t *node)
3822 {
3823   free_token (&node->token);
3824   return REG_NOERROR;
3825 }
3826
3827
3828 /* Duplicate the node SRC, and return new node.  This is a preorder
3829    visit similar to the one implemented by the generic visitor, but
3830    we need more infrastructure to maintain two parallel trees --- so,
3831    it's easier to duplicate.  */
3832
3833 static bin_tree_t *
3834 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3835 {
3836   const bin_tree_t *node;
3837   bin_tree_t *dup_root;
3838   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3839
3840   for (node = root; ; )
3841     {
3842       /* Create a new tree and link it back to the current parent.  */
3843       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3844       if (*p_new == NULL)
3845         return NULL;
3846       (*p_new)->parent = dup_node;
3847       (*p_new)->token.duplicated = 1;
3848       dup_node = *p_new;
3849
3850       /* Go to the left node, or up and to the right.  */
3851       if (node->left)
3852         {
3853           node = node->left;
3854           p_new = &dup_node->left;
3855         }
3856       else
3857         {
3858           const bin_tree_t *prev = NULL;
3859           while (node->right == prev || node->right == NULL)
3860             {
3861               prev = node;
3862               node = node->parent;
3863               dup_node = dup_node->parent;
3864               if (!node)
3865                 return dup_root;
3866             }
3867           node = node->right;
3868           p_new = &dup_node->right;
3869         }
3870     }
3871 }