git.cworth.org Git - tar/blob - gnu/regcomp.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   5    Software Foundation, Inc.
   6    This file is part of the GNU C Library.
   7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3, or (at your option)
  12    any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License along
  20    with this program; if not, write to the Free Software Foundation,
  21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  22
  23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  24                                           size_t length, reg_syntax_t syntax);
  25 static void re_compile_fastmap_iter (regex_t *bufp,
  26                                      const re_dfastate_t *init_state,
  27                                      char *fastmap);
  28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  29 #ifdef RE_ENABLE_I18N
  30 static void free_charset (re_charset_t *cset);
  31 #endif /* RE_ENABLE_I18N */
  32 static void free_workarea_compile (regex_t *preg);
  33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  34 #ifdef RE_ENABLE_I18N
  35 static void optimize_utf8 (re_dfa_t *dfa);
  36 #endif
  37 static reg_errcode_t analyze (regex_t *preg);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  53                                    unsigned int constraint);
  54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  56                                          Idx node, bool root);
  57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  58 static Idx fetch_number (re_string_t *input, re_token_t *token,
  59                          reg_syntax_t syntax);
  60 static int peek_token (re_token_t *token, re_string_t *input,
  61                         reg_syntax_t syntax) internal_function;
  62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  63                           reg_syntax_t syntax, reg_errcode_t *err);
  64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  65                                   re_token_t *token, reg_syntax_t syntax,
  66                                   Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  68                                  re_token_t *token, reg_syntax_t syntax,
  69                                  Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  71                                      re_token_t *token, reg_syntax_t syntax,
  72                                      Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  74                                   re_token_t *token, reg_syntax_t syntax,
  75                                   Idx nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  77                                  re_dfa_t *dfa, re_token_t *token,
  78                                  reg_syntax_t syntax, reg_errcode_t *err);
  79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  80                                       re_token_t *token, reg_syntax_t syntax,
  81                                       reg_errcode_t *err);
  82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  83                                             re_string_t *regexp,
  84                                             re_token_t *token, int token_len,
  85                                             re_dfa_t *dfa,
  86                                             reg_syntax_t syntax,
  87                                             bool accept_hyphen);
  88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  89                                           re_string_t *regexp,
  90                                           re_token_t *token);
  91 #ifdef RE_ENABLE_I18N
  92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  93                                         re_charset_t *mbcset,
  94                                         Idx *equiv_class_alloc,
  95                                         const unsigned char *name);
  96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  97                                       bitset_t sbcset,
  98                                       re_charset_t *mbcset,
  99                                       Idx *char_class_alloc,
 100                                       const unsigned char *class_name,
 101                                       reg_syntax_t syntax);
 102 #else  /* not RE_ENABLE_I18N */
 103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 104                                         const unsigned char *name);
 105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 106                                       bitset_t sbcset,
 107                                       const unsigned char *class_name,
 108                                       reg_syntax_t syntax);
 109 #endif /* not RE_ENABLE_I18N */
 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 111                                        RE_TRANSLATE_TYPE trans,
 112                                        const unsigned char *class_name,
 113                                        const unsigned char *extra,
 114                                        bool non_match, reg_errcode_t *err);
 115 static bin_tree_t *create_tree (re_dfa_t *dfa,
 116                                 bin_tree_t *left, bin_tree_t *right,
 117                                 re_token_type_t type);
 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 119                                       bin_tree_t *left, bin_tree_t *right,
 120                                       const re_token_t *token);
 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 122 static void free_token (re_token_t *node);
 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 125 \f
 126 /* This table gives an error message for each of the error codes listed
 127    in regex.h.  Obviously the order here has to be same as there.
 128    POSIX doesn't require that we do anything for REG_NOERROR,
 129    but why not be nice?  */
 130
 131 static const char __re_error_msgid[] =
 132   {
 133 #define REG_NOERROR_IDX 0
 134     gettext_noop ("Success")    /* REG_NOERROR */
 135     "\0"
 136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 137     gettext_noop ("No match")   /* REG_NOMATCH */
 138     "\0"
 139 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 141     "\0"
 142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 144     "\0"
 145 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 147     "\0"
 148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 150     "\0"
 151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 153     "\0"
 154 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 155     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 156     "\0"
 157 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 159     "\0"
 160 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 162     "\0"
 163 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 165     "\0"
 166 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 167     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 168     "\0"
 169 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 171     "\0"
 172 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 174     "\0"
 175 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 177     "\0"
 178 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 180     "\0"
 181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 183   };
 184
 185 static const size_t __re_error_msgid_idx[] =
 186   {
 187     REG_NOERROR_IDX,
 188     REG_NOMATCH_IDX,
 189     REG_BADPAT_IDX,
 190     REG_ECOLLATE_IDX,
 191     REG_ECTYPE_IDX,
 192     REG_EESCAPE_IDX,
 193     REG_ESUBREG_IDX,
 194     REG_EBRACK_IDX,
 195     REG_EPAREN_IDX,
 196     REG_EBRACE_IDX,
 197     REG_BADBR_IDX,
 198     REG_ERANGE_IDX,
 199     REG_ESPACE_IDX,
 200     REG_BADRPT_IDX,
 201     REG_EEND_IDX,
 202     REG_ESIZE_IDX,
 203     REG_ERPAREN_IDX
 204   };
 205 \f
 206 /* Entry points for GNU code.  */
 207
 208 /* re_compile_pattern is the GNU regular expression compiler: it
 209    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 210    Returns 0 if the pattern was valid, otherwise an error string.
 211
 212    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 213    are set in BUFP on entry.  */
 214
 215 #ifdef _LIBC
 216 const char *
 217 re_compile_pattern (pattern, length, bufp)
 218     const char *pattern;
 219     size_t length;
 220     struct re_pattern_buffer *bufp;
 221 #else /* size_t might promote */
 222 const char *
 223 re_compile_pattern (const char *pattern, size_t length,
 224                     struct re_pattern_buffer *bufp)
 225 #endif
 226 {
 227   reg_errcode_t ret;
 228
 229   /* And GNU code determines whether or not to get register information
 230      by passing null for the REGS argument to re_match, etc., not by
 231      setting no_sub, unless RE_NO_SUB is set.  */
 232   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 233
 234   /* Match anchors at newline.  */
 235   bufp->newline_anchor = 1;
 236
 237   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 238
 239   if (!ret)
 240     return NULL;
 241   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 242 }
 243 #ifdef _LIBC
 244 weak_alias (__re_compile_pattern, re_compile_pattern)
 245 #endif
 246
 247 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 248    also be assigned to arbitrarily: each pattern buffer stores its own
 249    syntax, so it can be changed between regex compilations.  */
 250 /* This has no initializer because initialized variables in Emacs
 251    become read-only after dumping.  */
 252 reg_syntax_t re_syntax_options;
 253
 254
 255 /* Specify the precise syntax of regexps for compilation.  This provides
 256    for compatibility for various utilities which historically have
 257    different, incompatible syntaxes.
 258
 259    The argument SYNTAX is a bit mask comprised of the various bits
 260    defined in regex.h.  We return the old syntax.  */
 261
 262 reg_syntax_t
 263 re_set_syntax (syntax)
 264     reg_syntax_t syntax;
 265 {
 266   reg_syntax_t ret = re_syntax_options;
 267
 268   re_syntax_options = syntax;
 269   return ret;
 270 }
 271 #ifdef _LIBC
 272 weak_alias (__re_set_syntax, re_set_syntax)
 273 #endif
 274
 275 int
 276 re_compile_fastmap (bufp)
 277     struct re_pattern_buffer *bufp;
 278 {
 279   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 280   char *fastmap = bufp->fastmap;
 281
 282   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 283   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 284   if (dfa->init_state != dfa->init_state_word)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 286   if (dfa->init_state != dfa->init_state_nl)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 288   if (dfa->init_state != dfa->init_state_begbuf)
 289     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 290   bufp->fastmap_accurate = 1;
 291   return 0;
 292 }
 293 #ifdef _LIBC
 294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 295 #endif
 296
 297 static inline void
 298 __attribute ((always_inline))
 299 re_set_fastmap (char *fastmap, bool icase, int ch)
 300 {
 301   fastmap[ch] = 1;
 302   if (icase)
 303     fastmap[tolower (ch)] = 1;
 304 }
 305
 306 /* Helper function for re_compile_fastmap.
 307    Compile fastmap for the initial_state INIT_STATE.  */
 308
 309 static void
 310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 311                          char *fastmap)
 312 {
 313   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 314   Idx node_cnt;
 315   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 316   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 317     {
 318       Idx node = init_state->nodes.elems[node_cnt];
 319       re_token_type_t type = dfa->nodes[node].type;
 320
 321       if (type == CHARACTER)
 322         {
 323           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 324 #ifdef RE_ENABLE_I18N
 325           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 326             {
 327               unsigned char buf[MB_LEN_MAX];
 328               unsigned char *p;
 329               wchar_t wc;
 330               mbstate_t state;
 331
 332               p = buf;
 333               *p++ = dfa->nodes[node].opr.c;
 334               while (++node < dfa->nodes_len
 335                      && dfa->nodes[node].type == CHARACTER
 336                      && dfa->nodes[node].mb_partial)
 337                 *p++ = dfa->nodes[node].opr.c;
 338               memset (&state, '\0', sizeof (state));
 339               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 340                              &state) == p - buf
 341                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 342                       != (size_t) -1))
 343                 re_set_fastmap (fastmap, false, buf[0]);
 344             }
 345 #endif
 346         }
 347       else if (type == SIMPLE_BRACKET)
 348         {
 349           int i, ch;
 350           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 351             {
 352               int j;
 353               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 354               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 355                 if (w & ((bitset_word_t) 1 << j))
 356                   re_set_fastmap (fastmap, icase, ch);
 357             }
 358         }
 359 #ifdef RE_ENABLE_I18N
 360       else if (type == COMPLEX_BRACKET)
 361         {
 362           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 363           Idx i;
 364
 365 # ifdef _LIBC
 366           /* See if we have to try all bytes which start multiple collation
 367              elements.
 368              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 369                   collation element, and don't catch 'b' since 'b' is
 370                   the only collation element which starts from 'b' (and
 371                   it is caught by SIMPLE_BRACKET).  */
 372               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 373                   && (cset->ncoll_syms || cset->nranges))
 374                 {
 375                   const int32_t *table = (const int32_t *)
 376                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 377                   for (i = 0; i < SBC_MAX; ++i)
 378                     if (table[i] < 0)
 379                       re_set_fastmap (fastmap, icase, i);
 380                 }
 381 # endif /* _LIBC */
 382
 383           /* See if we have to start the match at all multibyte characters,
 384              i.e. where we would not find an invalid sequence.  This only
 385              applies to multibyte character sets; for single byte character
 386              sets, the SIMPLE_BRACKET again suffices.  */
 387           if (dfa->mb_cur_max > 1
 388               && (cset->nchar_classes || cset->non_match || cset->nranges
 389 # ifdef _LIBC
 390                   || cset->nequiv_classes
 391 # endif /* _LIBC */
 392                  ))
 393             {
 394               unsigned char c = 0;
 395               do
 396                 {
 397                   mbstate_t mbs;
 398                   memset (&mbs, 0, sizeof (mbs));
 399                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 400                     re_set_fastmap (fastmap, false, (int) c);
 401                 }
 402               while (++c != 0);
 403             }
 404
 405           else
 406             {
 407               /* ... Else catch all bytes which can start the mbchars.  */
 408               for (i = 0; i < cset->nmbchars; ++i)
 409                 {
 410                   char buf[256];
 411                   mbstate_t state;
 412                   memset (&state, '\0', sizeof (state));
 413                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 414                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 415                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 416                     {
 417                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 418                           != (size_t) -1)
 419                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 420                     }
 421                 }
 422             }
 423         }
 424 #endif /* RE_ENABLE_I18N */
 425       else if (type == OP_PERIOD
 426 #ifdef RE_ENABLE_I18N
 427                || type == OP_UTF8_PERIOD
 428 #endif /* RE_ENABLE_I18N */
 429                || type == END_OF_RE)
 430         {
 431           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 432           if (type == END_OF_RE)
 433             bufp->can_be_null = 1;
 434           return;
 435         }
 436     }
 437 }
 438 \f
 439 /* Entry point for POSIX code.  */
 440 /* regcomp takes a regular expression as a string and compiles it.
 441
 442    PREG is a regex_t *.  We do not expect any fields to be initialized,
 443    since POSIX says we shouldn't.  Thus, we set
 444
 445      `buffer' to the compiled pattern;
 446      `used' to the length of the compiled pattern;
 447      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 448        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 449        RE_SYNTAX_POSIX_BASIC;
 450      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 451      `fastmap' to an allocated space for the fastmap;
 452      `fastmap_accurate' to zero;
 453      `re_nsub' to the number of subexpressions in PATTERN.
 454
 455    PATTERN is the address of the pattern string.
 456
 457    CFLAGS is a series of bits which affect compilation.
 458
 459      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 460      use POSIX basic syntax.
 461
 462      If REG_NEWLINE is set, then . and [^...] don't match newline.
 463      Also, regexec will try a match beginning after every newline.
 464
 465      If REG_ICASE is set, then we considers upper- and lowercase
 466      versions of letters to be equivalent when matching.
 467
 468      If REG_NOSUB is set, then when PREG is passed to regexec, that
 469      routine will report only success or failure, and nothing about the
 470      registers.
 471
 472    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 473    the return codes and their meanings.)  */
 474
 475 int
 476 regcomp (preg, pattern, cflags)
 477     regex_t *_Restrict_ preg;
 478     const char *_Restrict_ pattern;
 479     int cflags;
 480 {
 481   reg_errcode_t ret;
 482   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 483                          : RE_SYNTAX_POSIX_BASIC);
 484
 485   preg->buffer = NULL;
 486   preg->allocated = 0;
 487   preg->used = 0;
 488
 489   /* Try to allocate space for the fastmap.  */
 490   preg->fastmap = re_malloc (char, SBC_MAX);
 491   if (BE (preg->fastmap == NULL, 0))
 492     return REG_ESPACE;
 493
 494   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 495
 496   /* If REG_NEWLINE is set, newlines are treated differently.  */
 497   if (cflags & REG_NEWLINE)
 498     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 499       syntax &= ~RE_DOT_NEWLINE;
 500       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 501       /* It also changes the matching behavior.  */
 502       preg->newline_anchor = 1;
 503     }
 504   else
 505     preg->newline_anchor = 0;
 506   preg->no_sub = !!(cflags & REG_NOSUB);
 507   preg->translate = NULL;
 508
 509   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 510
 511   /* POSIX doesn't distinguish between an unmatched open-group and an
 512      unmatched close-group: both are REG_EPAREN.  */
 513   if (ret == REG_ERPAREN)
 514     ret = REG_EPAREN;
 515
 516   /* We have already checked preg->fastmap != NULL.  */
 517   if (BE (ret == REG_NOERROR, 1))
 518     /* Compute the fastmap now, since regexec cannot modify the pattern
 519        buffer.  This function never fails in this implementation.  */
 520     (void) re_compile_fastmap (preg);
 521   else
 522     {
 523       /* Some error occurred while compiling the expression.  */
 524       re_free (preg->fastmap);
 525       preg->fastmap = NULL;
 526     }
 527
 528   return (int) ret;
 529 }
 530 #ifdef _LIBC
 531 weak_alias (__regcomp, regcomp)
 532 #endif
 533
 534 /* Returns a message corresponding to an error code, ERRCODE, returned
 535    from either regcomp or regexec.   We don't use PREG here.  */
 536
 537 #ifdef _LIBC
 538 size_t
 539 regerror (errcode, preg, errbuf, errbuf_size)
 540     int errcode;
 541     const regex_t *_Restrict_ preg;
 542     char *_Restrict_ errbuf;
 543     size_t errbuf_size;
 544 #else /* size_t might promote */
 545 size_t
 546 regerror (int errcode, const regex_t *_Restrict_ preg,
 547           char *_Restrict_ errbuf, size_t errbuf_size)
 548 #endif
 549 {
 550   const char *msg;
 551   size_t msg_size;
 552
 553   if (BE (errcode < 0
 554           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 555                                / sizeof (__re_error_msgid_idx[0])), 0))
 556     /* Only error codes returned by the rest of the code should be passed
 557        to this routine.  If we are given anything else, or if other regex
 558        code generates an invalid error code, then the program has a bug.
 559        Dump core so we can fix it.  */
 560     abort ();
 561
 562   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 563
 564   msg_size = strlen (msg) + 1; /* Includes the null.  */
 565
 566   if (BE (errbuf_size != 0, 1))
 567     {
 568       size_t cpy_size = msg_size;
 569       if (BE (msg_size > errbuf_size, 0))
 570         {
 571           cpy_size = errbuf_size - 1;
 572           errbuf[cpy_size] = '\0';
 573         }
 574       memcpy (errbuf, msg, cpy_size);
 575     }
 576
 577   return msg_size;
 578 }
 579 #ifdef _LIBC
 580 weak_alias (__regerror, regerror)
 581 #endif
 582
 583
 584 #ifdef RE_ENABLE_I18N
 585 /* This static array is used for the map to single-byte characters when
 586    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 587    it the same all the time.  UTF-8 is the preferred encoding so this is
 588    a worthwhile optimization.  */
 589 static const bitset_t utf8_sb_map =
 590 {
 591   /* Set the first 128 bits.  */
 592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 593 #  error "bitset_word_t is narrower than 32 bits"
 594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX,
 600 # endif
 601   (BITSET_WORD_MAX
 602    >> (SBC_MAX % BITSET_WORD_BITS == 0
 603        ? 0
 604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 605 };
 606 #endif
 607
 608
 609 static void
 610 free_dfa_content (re_dfa_t *dfa)
 611 {
 612   Idx i, j;
 613
 614   if (dfa->nodes)
 615     for (i = 0; i < dfa->nodes_len; ++i)
 616       free_token (dfa->nodes + i);
 617   re_free (dfa->nexts);
 618   for (i = 0; i < dfa->nodes_len; ++i)
 619     {
 620       if (dfa->eclosures != NULL)
 621         re_node_set_free (dfa->eclosures + i);
 622       if (dfa->inveclosures != NULL)
 623         re_node_set_free (dfa->inveclosures + i);
 624       if (dfa->edests != NULL)
 625         re_node_set_free (dfa->edests + i);
 626     }
 627   re_free (dfa->edests);
 628   re_free (dfa->eclosures);
 629   re_free (dfa->inveclosures);
 630   re_free (dfa->nodes);
 631
 632   if (dfa->state_table)
 633     for (i = 0; i <= dfa->state_hash_mask; ++i)
 634       {
 635         struct re_state_table_entry *entry = dfa->state_table + i;
 636         for (j = 0; j < entry->num; ++j)
 637           {
 638             re_dfastate_t *state = entry->array[j];
 639             free_state (state);
 640           }
 641         re_free (entry->array);
 642       }
 643   re_free (dfa->state_table);
 644 #ifdef RE_ENABLE_I18N
 645   if (dfa->sb_char != utf8_sb_map)
 646     re_free (dfa->sb_char);
 647 #endif
 648   re_free (dfa->subexp_map);
 649 #ifdef DEBUG
 650   re_free (dfa->re_str);
 651 #endif
 652
 653   re_free (dfa);
 654 }
 655
 656
 657 /* Free dynamically allocated space used by PREG.  */
 658
 659 void
 660 regfree (preg)
 661     regex_t *preg;
 662 {
 663   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 664   if (BE (dfa != NULL, 1))
 665     free_dfa_content (dfa);
 666   preg->buffer = NULL;
 667   preg->allocated = 0;
 668
 669   re_free (preg->fastmap);
 670   preg->fastmap = NULL;
 671
 672   re_free (preg->translate);
 673   preg->translate = NULL;
 674 }
 675 #ifdef _LIBC
 676 weak_alias (__regfree, regfree)
 677 #endif
 678 \f
 679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 680    them unless specifically requested.  */
 681
 682 #if defined _REGEX_RE_COMP || defined _LIBC
 683
 684 /* BSD has one and only one pattern buffer.  */
 685 static struct re_pattern_buffer re_comp_buf;
 686
 687 char *
 688 # ifdef _LIBC
 689 /* Make these definitions weak in libc, so POSIX programs can redefine
 690    these names if they don't use our functions, and still use
 691    regcomp/regexec above without link errors.  */
 692 weak_function
 693 # endif
 694 re_comp (s)
 695      const char *s;
 696 {
 697   reg_errcode_t ret;
 698   char *fastmap;
 699
 700   if (!s)
 701     {
 702       if (!re_comp_buf.buffer)
 703         return gettext ("No previous regular expression");
 704       return 0;
 705     }
 706
 707   if (re_comp_buf.buffer)
 708     {
 709       fastmap = re_comp_buf.fastmap;
 710       re_comp_buf.fastmap = NULL;
 711       __regfree (&re_comp_buf);
 712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 713       re_comp_buf.fastmap = fastmap;
 714     }
 715
 716   if (re_comp_buf.fastmap == NULL)
 717     {
 718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 719       if (re_comp_buf.fastmap == NULL)
 720         return (char *) gettext (__re_error_msgid
 721                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 722     }
 723
 724   /* Since `re_exec' always passes NULL for the `regs' argument, we
 725      don't need to initialize the pattern buffer fields which affect it.  */
 726
 727   /* Match anchors at newlines.  */
 728   re_comp_buf.newline_anchor = 1;
 729
 730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 731
 732   if (!ret)
 733     return NULL;
 734
 735   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 737 }
 738
 739 #ifdef _LIBC
 740 libc_freeres_fn (free_mem)
 741 {
 742   __regfree (&re_comp_buf);
 743 }
 744 #endif
 745
 746 #endif /* _REGEX_RE_COMP */
 747 \f
 748 /* Internal entry point.
 749    Compile the regular expression PATTERN, whose length is LENGTH.
 750    SYNTAX indicate regular expression's syntax.  */
 751
 752 static reg_errcode_t
 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 754                      reg_syntax_t syntax)
 755 {
 756   reg_errcode_t err = REG_NOERROR;
 757   re_dfa_t *dfa;
 758   re_string_t regexp;
 759
 760   /* Initialize the pattern buffer.  */
 761   preg->fastmap_accurate = 0;
 762   preg->syntax = syntax;
 763   preg->not_bol = preg->not_eol = 0;
 764   preg->used = 0;
 765   preg->re_nsub = 0;
 766   preg->can_be_null = 0;
 767   preg->regs_allocated = REGS_UNALLOCATED;
 768
 769   /* Initialize the dfa.  */
 770   dfa = (re_dfa_t *) preg->buffer;
 771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 772     {
 773       /* If zero allocated, but buffer is non-null, try to realloc
 774          enough space.  This loses if buffer's address is bogus, but
 775          that is the user's responsibility.  If ->buffer is NULL this
 776          is a simple allocation.  */
 777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 778       if (dfa == NULL)
 779         return REG_ESPACE;
 780       preg->allocated = sizeof (re_dfa_t);
 781       preg->buffer = (unsigned char *) dfa;
 782     }
 783   preg->used = sizeof (re_dfa_t);
 784
 785   err = init_dfa (dfa, length);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788       free_dfa_content (dfa);
 789       preg->buffer = NULL;
 790       preg->allocated = 0;
 791       return err;
 792     }
 793 #ifdef DEBUG
 794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 795   dfa->re_str = re_malloc (char, length + 1);
 796   strncpy (dfa->re_str, pattern, length + 1);
 797 #endif
 798
 799   __libc_lock_init (dfa->lock);
 800
 801   err = re_string_construct (&regexp, pattern, length, preg->translate,
 802                              (syntax & RE_ICASE) != 0, dfa);
 803   if (BE (err != REG_NOERROR, 0))
 804     {
 805     re_compile_internal_free_return:
 806       free_workarea_compile (preg);
 807       re_string_destruct (&regexp);
 808       free_dfa_content (dfa);
 809       preg->buffer = NULL;
 810       preg->allocated = 0;
 811       return err;
 812     }
 813
 814   /* Parse the regular expression, and build a structure tree.  */
 815   preg->re_nsub = 0;
 816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 817   if (BE (dfa->str_tree == NULL, 0))
 818     goto re_compile_internal_free_return;
 819
 820   /* Analyze the tree and create the nfa.  */
 821   err = analyze (preg);
 822   if (BE (err != REG_NOERROR, 0))
 823     goto re_compile_internal_free_return;
 824
 825 #ifdef RE_ENABLE_I18N
 826   /* If possible, do searching in single byte encoding to speed things up.  */
 827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 828     optimize_utf8 (dfa);
 829 #endif
 830
 831   /* Then create the initial state of the dfa.  */
 832   err = create_initial_state (dfa);
 833
 834   /* Release work areas.  */
 835   free_workarea_compile (preg);
 836   re_string_destruct (&regexp);
 837
 838   if (BE (err != REG_NOERROR, 0))
 839     {
 840       free_dfa_content (dfa);
 841       preg->buffer = NULL;
 842       preg->allocated = 0;
 843     }
 844
 845   return err;
 846 }
 847
 848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 849    as the initial length of some arrays.  */
 850
 851 static reg_errcode_t
 852 init_dfa (re_dfa_t *dfa, size_t pat_len)
 853 {
 854   __re_size_t table_size;
 855 #ifndef _LIBC
 856   char *codeset_name;
 857 #endif
 858 #ifdef RE_ENABLE_I18N
 859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 860 #else
 861   size_t max_i18n_object_size = 0;
 862 #endif
 863   size_t max_object_size =
 864     MAX (sizeof (struct re_state_table_entry),
 865          MAX (sizeof (re_token_t),
 866               MAX (sizeof (re_node_set),
 867                    MAX (sizeof (regmatch_t),
 868                         max_i18n_object_size))));
 869
 870   memset (dfa, '\0', sizeof (re_dfa_t));
 871
 872   /* Force allocation of str_tree_storage the first time.  */
 873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 874
 875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 876      calculation below, and for similar doubling calculations
 877      elsewhere.  And it's <= rather than <, because some of the
 878      doubling calculations add 1 afterwards.  */
 879   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 880     return REG_ESPACE;
 881
 882   dfa->nodes_alloc = pat_len + 1;
 883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 884
 885   /*  table_size = 2 ^ ceil(log pat_len) */
 886   for (table_size = 1; ; table_size <<= 1)
 887     if (table_size > pat_len)
 888       break;
 889
 890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 891   dfa->state_hash_mask = table_size - 1;
 892
 893   dfa->mb_cur_max = MB_CUR_MAX;
 894 #ifdef _LIBC
 895   if (dfa->mb_cur_max == 6
 896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 899                        != 0);
 900 #else
 901   codeset_name = nl_langinfo (CODESET);
 902   if (strcasecmp (codeset_name, "UTF-8") == 0
 903       || strcasecmp (codeset_name, "UTF8") == 0)
 904     dfa->is_utf8 = 1;
 905
 906   /* We check exhaustively in the loop below if this charset is a
 907      superset of ASCII.  */
 908   dfa->map_notascii = 0;
 909 #endif
 910
 911 #ifdef RE_ENABLE_I18N
 912   if (dfa->mb_cur_max > 1)
 913     {
 914       if (dfa->is_utf8)
 915         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 916       else
 917         {
 918           int i, j, ch;
 919
 920           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 921           if (BE (dfa->sb_char == NULL, 0))
 922             return REG_ESPACE;
 923
 924           /* Set the bits corresponding to single byte chars.  */
 925           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 926             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 927               {
 928                 wint_t wch = __btowc (ch);
 929                 if (wch != WEOF)
 930                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 931 # ifndef _LIBC
 932                 if (isascii (ch) && wch != ch)
 933                   dfa->map_notascii = 1;
 934 # endif
 935               }
 936         }
 937     }
 938 #endif
 939
 940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 941     return REG_ESPACE;
 942   return REG_NOERROR;
 943 }
 944
 945 /* Initialize WORD_CHAR table, which indicate which character is
 946    "word".  In this case "word" means that it is the word construction
 947    character used by some operators like "\<", "\>", etc.  */
 948
 949 static void
 950 internal_function
 951 init_word_char (re_dfa_t *dfa)
 952 {
 953   int i, j, ch;
 954   dfa->word_ops_used = 1;
 955   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 956     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 957       if (isalnum (ch) || ch == '_')
 958         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 959 }
 960
 961 /* Free the work area which are only used while compiling.  */
 962
 963 static void
 964 free_workarea_compile (regex_t *preg)
 965 {
 966   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 967   bin_tree_storage_t *storage, *next;
 968   for (storage = dfa->str_tree_storage; storage; storage = next)
 969     {
 970       next = storage->next;
 971       re_free (storage);
 972     }
 973   dfa->str_tree_storage = NULL;
 974   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 975   dfa->str_tree = NULL;
 976   re_free (dfa->org_indices);
 977   dfa->org_indices = NULL;
 978 }
 979
 980 /* Create initial states for all contexts.  */
 981
 982 static reg_errcode_t
 983 create_initial_state (re_dfa_t *dfa)
 984 {
 985   Idx first, i;
 986   reg_errcode_t err;
 987   re_node_set init_nodes;
 988
 989   /* Initial states have the epsilon closure of the node which is
 990      the first node of the regular expression.  */
 991   first = dfa->str_tree->first->node_idx;
 992   dfa->init_node = first;
 993   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 994   if (BE (err != REG_NOERROR, 0))
 995     return err;
 996
 997   /* The back-references which are in initial states can epsilon transit,
 998      since in this case all of the subexpressions can be null.
 999      Then we add epsilon closures of the nodes which are the next nodes of
1000      the back-references.  */
1001   if (dfa->nbackref > 0)
1002     for (i = 0; i < init_nodes.nelem; ++i)
1003       {
1004         Idx node_idx = init_nodes.elems[i];
1005         re_token_type_t type = dfa->nodes[node_idx].type;
1006
1007         Idx clexp_idx;
1008         if (type != OP_BACK_REF)
1009           continue;
1010         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011           {
1012             re_token_t *clexp_node;
1013             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014             if (clexp_node->type == OP_CLOSE_SUBEXP
1015                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016               break;
1017           }
1018         if (clexp_idx == init_nodes.nelem)
1019           continue;
1020
1021         if (type == OP_BACK_REF)
1022           {
1023             Idx dest_idx = dfa->edests[node_idx].elems[0];
1024             if (!re_node_set_contains (&init_nodes, dest_idx))
1025               {
1026                 reg_errcode_t merge_err
1027                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1028                 if (merge_err != REG_NOERROR)
1029                   return merge_err;
1030                 i = 0;
1031               }
1032           }
1033       }
1034
1035   /* It must be the first time to invoke acquire_state.  */
1036   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1037   /* We don't check ERR here, since the initial state must not be NULL.  */
1038   if (BE (dfa->init_state == NULL, 0))
1039     return err;
1040   if (dfa->init_state->has_constraint)
1041     {
1042       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1043                                                        CONTEXT_WORD);
1044       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1045                                                      CONTEXT_NEWLINE);
1046       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1047                                                          &init_nodes,
1048                                                          CONTEXT_NEWLINE
1049                                                          | CONTEXT_BEGBUF);
1050       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1051               || dfa->init_state_begbuf == NULL, 0))
1052         return err;
1053     }
1054   else
1055     dfa->init_state_word = dfa->init_state_nl
1056       = dfa->init_state_begbuf = dfa->init_state;
1057
1058   re_node_set_free (&init_nodes);
1059   return REG_NOERROR;
1060 }
1061 \f
1062 #ifdef RE_ENABLE_I18N
1063 /* If it is possible to do searching in single byte encoding instead of UTF-8
1064    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1065    DFA nodes where needed.  */
1066
1067 static void
1068 optimize_utf8 (re_dfa_t *dfa)
1069 {
1070   Idx node;
1071   int i;
1072   bool mb_chars = false;
1073   bool has_period = false;
1074
1075   for (node = 0; node < dfa->nodes_len; ++node)
1076     switch (dfa->nodes[node].type)
1077       {
1078       case CHARACTER:
1079         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1080           mb_chars = true;
1081         break;
1082       case ANCHOR:
1083         switch (dfa->nodes[node].opr.ctx_type)
1084           {
1085           case LINE_FIRST:
1086           case LINE_LAST:
1087           case BUF_FIRST:
1088           case BUF_LAST:
1089             break;
1090           default:
1091             /* Word anchors etc. cannot be handled.  It's okay to test
1092                opr.ctx_type since constraints (for all DFA nodes) are
1093                created by ORing one or more opr.ctx_type values.  */
1094             return;
1095           }
1096         break;
1097       case OP_PERIOD:
1098         has_period = true;
1099         break;
1100       case OP_BACK_REF:
1101       case OP_ALT:
1102       case END_OF_RE:
1103       case OP_DUP_ASTERISK:
1104       case OP_OPEN_SUBEXP:
1105       case OP_CLOSE_SUBEXP:
1106         break;
1107       case COMPLEX_BRACKET:
1108         return;
1109       case SIMPLE_BRACKET:
1110         /* Just double check.  */
1111         {
1112           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1113                         ? 0
1114                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1115           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1116             {
1117               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1118                 return;
1119               rshift = 0;
1120             }
1121         }
1122         break;
1123       default:
1124         abort ();
1125       }
1126
1127   if (mb_chars || has_period)
1128     for (node = 0; node < dfa->nodes_len; ++node)
1129       {
1130         if (dfa->nodes[node].type == CHARACTER
1131             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1132           dfa->nodes[node].mb_partial = 0;
1133         else if (dfa->nodes[node].type == OP_PERIOD)
1134           dfa->nodes[node].type = OP_UTF8_PERIOD;
1135       }
1136
1137   /* The search can be in single byte locale.  */
1138   dfa->mb_cur_max = 1;
1139   dfa->is_utf8 = 0;
1140   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141 }
1142 #endif
1143 \f
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145    "eclosure", and "inveclosure".  */
1146
1147 static reg_errcode_t
1148 analyze (regex_t *preg)
1149 {
1150   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151   reg_errcode_t ret;
1152
1153   /* Allocate arrays.  */
1154   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1156   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159           || dfa->eclosures == NULL, 0))
1160     return REG_ESPACE;
1161
1162   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1163   if (dfa->subexp_map != NULL)
1164     {
1165       Idx i;
1166       for (i = 0; i < preg->re_nsub; i++)
1167         dfa->subexp_map[i] = i;
1168       preorder (dfa->str_tree, optimize_subexps, dfa);
1169       for (i = 0; i < preg->re_nsub; i++)
1170         if (dfa->subexp_map[i] != i)
1171           break;
1172       if (i == preg->re_nsub)
1173         {
1174           free (dfa->subexp_map);
1175           dfa->subexp_map = NULL;
1176         }
1177     }
1178
1179   ret = postorder (dfa->str_tree, lower_subexps, preg);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   ret = postorder (dfa->str_tree, calc_first, dfa);
1183   if (BE (ret != REG_NOERROR, 0))
1184     return ret;
1185   preorder (dfa->str_tree, calc_next, dfa);
1186   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189   ret = calc_eclosure (dfa);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192
1193   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1195   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196       || dfa->nbackref)
1197     {
1198       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199       if (BE (dfa->inveclosures == NULL, 0))
1200         return REG_ESPACE;
1201       ret = calc_inveclosure (dfa);
1202     }
1203
1204   return ret;
1205 }
1206
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208    implement parse tree visits.  Instead, we use parent pointers and
1209    some hairy code in these two functions.  */
1210 static reg_errcode_t
1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212            void *extra)
1213 {
1214   bin_tree_t *node, *prev;
1215
1216   for (node = root; ; )
1217     {
1218       /* Descend down the tree, preferably to the left (or to the right
1219          if that's the only child).  */
1220       while (node->left || node->right)
1221         if (node->left)
1222           node = node->left;
1223         else
1224           node = node->right;
1225
1226       do
1227         {
1228           reg_errcode_t err = fn (extra, node);
1229           if (BE (err != REG_NOERROR, 0))
1230             return err;
1231           if (node->parent == NULL)
1232             return REG_NOERROR;
1233           prev = node;
1234           node = node->parent;
1235         }
1236       /* Go up while we have a node that is reached from the right.  */
1237       while (node->right == prev || node->right == NULL);
1238       node = node->right;
1239     }
1240 }
1241
1242 static reg_errcode_t
1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244           void *extra)
1245 {
1246   bin_tree_t *node;
1247
1248   for (node = root; ; )
1249     {
1250       reg_errcode_t err = fn (extra, node);
1251       if (BE (err != REG_NOERROR, 0))
1252         return err;
1253
1254       /* Go to the left node, or up and to the right.  */
1255       if (node->left)
1256         node = node->left;
1257       else
1258         {
1259           bin_tree_t *prev = NULL;
1260           while (node->right == prev || node->right == NULL)
1261             {
1262               prev = node;
1263               node = node->parent;
1264               if (!node)
1265                 return REG_NOERROR;
1266             }
1267           node = node->right;
1268         }
1269     }
1270 }
1271
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1274    backreferences as well.  Requires a preorder visit.  */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra, bin_tree_t *node)
1277 {
1278   re_dfa_t *dfa = (re_dfa_t *) extra;
1279
1280   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281     {
1282       int idx = node->token.opr.idx;
1283       node->token.opr.idx = dfa->subexp_map[idx];
1284       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285     }
1286
1287   else if (node->token.type == SUBEXP
1288            && node->left && node->left->token.type == SUBEXP)
1289     {
1290       Idx other_idx = node->left->token.opr.idx;
1291
1292       node->left = node->left->left;
1293       if (node->left)
1294         node->left->parent = node;
1295
1296       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297       if (other_idx < BITSET_WORD_BITS)
1298         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299     }
1300
1301   return REG_NOERROR;
1302 }
1303
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1306 static reg_errcode_t
1307 lower_subexps (void *extra, bin_tree_t *node)
1308 {
1309   regex_t *preg = (regex_t *) extra;
1310   reg_errcode_t err = REG_NOERROR;
1311
1312   if (node->left && node->left->token.type == SUBEXP)
1313     {
1314       node->left = lower_subexp (&err, preg, node->left);
1315       if (node->left)
1316         node->left->parent = node;
1317     }
1318   if (node->right && node->right->token.type == SUBEXP)
1319     {
1320       node->right = lower_subexp (&err, preg, node->right);
1321       if (node->right)
1322         node->right->parent = node;
1323     }
1324
1325   return err;
1326 }
1327
1328 static bin_tree_t *
1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 {
1331   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332   bin_tree_t *body = node->left;
1333   bin_tree_t *op, *cls, *tree1, *tree;
1334
1335   if (preg->no_sub
1336       /* We do not optimize empty subexpressions, because otherwise we may
1337          have bad CONCAT nodes with NULL children.  This is obviously not
1338          very common, so we do not lose much.  An example that triggers
1339          this case is the sed "script" /\(\)/x.  */
1340       && node->left != NULL
1341       && (node->token.opr.idx >= BITSET_WORD_BITS
1342           || !(dfa->used_bkref_map
1343                & ((bitset_word_t) 1 << node->token.opr.idx))))
1344     return node->left;
1345
1346   /* Convert the SUBEXP node to the concatenation of an
1347      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1348   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351   tree = create_tree (dfa, op, tree1, CONCAT);
1352   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353     {
1354       *err = REG_ESPACE;
1355       return NULL;
1356     }
1357
1358   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360   return tree;
1361 }
1362
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364    nodes.  Requires a postorder visit.  */
1365 static reg_errcode_t
1366 calc_first (void *extra, bin_tree_t *node)
1367 {
1368   re_dfa_t *dfa = (re_dfa_t *) extra;
1369   if (node->token.type == CONCAT)
1370     {
1371       node->first = node->left->first;
1372       node->node_idx = node->left->node_idx;
1373     }
1374   else
1375     {
1376       node->first = node;
1377       node->node_idx = re_dfa_add_node (dfa, node->token);
1378       if (BE (node->node_idx == REG_MISSING, 0))
1379         return REG_ESPACE;
1380       if (node->token.type == ANCHOR)
1381         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1382     }
1383   return REG_NOERROR;
1384 }
1385
1386 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1387 static reg_errcode_t
1388 calc_next (void *extra, bin_tree_t *node)
1389 {
1390   switch (node->token.type)
1391     {
1392     case OP_DUP_ASTERISK:
1393       node->left->next = node;
1394       break;
1395     case CONCAT:
1396       node->left->next = node->right->first;
1397       node->right->next = node->next;
1398       break;
1399     default:
1400       if (node->left)
1401         node->left->next = node->next;
1402       if (node->right)
1403         node->right->next = node->next;
1404       break;
1405     }
1406   return REG_NOERROR;
1407 }
1408
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 {
1413   re_dfa_t *dfa = (re_dfa_t *) extra;
1414   Idx idx = node->node_idx;
1415   reg_errcode_t err = REG_NOERROR;
1416
1417   switch (node->token.type)
1418     {
1419     case CONCAT:
1420       break;
1421
1422     case END_OF_RE:
1423       assert (node->next == NULL);
1424       break;
1425
1426     case OP_DUP_ASTERISK:
1427     case OP_ALT:
1428       {
1429         Idx left, right;
1430         dfa->has_plural_match = 1;
1431         if (node->left != NULL)
1432           left = node->left->first->node_idx;
1433         else
1434           left = node->next->node_idx;
1435         if (node->right != NULL)
1436           right = node->right->first->node_idx;
1437         else
1438           right = node->next->node_idx;
1439         assert (REG_VALID_INDEX (left));
1440         assert (REG_VALID_INDEX (right));
1441         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1442       }
1443       break;
1444
1445     case ANCHOR:
1446     case OP_OPEN_SUBEXP:
1447     case OP_CLOSE_SUBEXP:
1448       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449       break;
1450
1451     case OP_BACK_REF:
1452       dfa->nexts[idx] = node->next->node_idx;
1453       if (node->token.type == OP_BACK_REF)
1454         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455       break;
1456
1457     default:
1458       assert (!IS_EPSILON_NODE (node->token.type));
1459       dfa->nexts[idx] = node->next->node_idx;
1460       break;
1461     }
1462
1463   return err;
1464 }
1465
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468    to their own constraint.  */
1469
1470 static reg_errcode_t
1471 internal_function
1472 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1473                         Idx root_node, unsigned int init_constraint)
1474 {
1475   Idx org_node, clone_node;
1476   bool ok;
1477   unsigned int constraint = init_constraint;
1478   for (org_node = top_org_node, clone_node = top_clone_node;;)
1479     {
1480       Idx org_dest, clone_dest;
1481       if (dfa->nodes[org_node].type == OP_BACK_REF)
1482         {
1483           /* If the back reference epsilon-transit, its destination must
1484              also have the constraint.  Then duplicate the epsilon closure
1485              of the destination of the back reference, and store it in
1486              edests of the back reference.  */
1487           org_dest = dfa->nexts[org_node];
1488           re_node_set_empty (dfa->edests + clone_node);
1489           clone_dest = duplicate_node (dfa, org_dest, constraint);
1490           if (BE (clone_dest == REG_MISSING, 0))
1491             return REG_ESPACE;
1492           dfa->nexts[clone_node] = dfa->nexts[org_node];
1493           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494           if (BE (! ok, 0))
1495             return REG_ESPACE;
1496         }
1497       else if (dfa->edests[org_node].nelem == 0)
1498         {
1499           /* In case of the node can't epsilon-transit, don't duplicate the
1500              destination and store the original destination as the
1501              destination of the node.  */
1502           dfa->nexts[clone_node] = dfa->nexts[org_node];
1503           break;
1504         }
1505       else if (dfa->edests[org_node].nelem == 1)
1506         {
1507           /* In case of the node can epsilon-transit, and it has only one
1508              destination.  */
1509           org_dest = dfa->edests[org_node].elems[0];
1510           re_node_set_empty (dfa->edests + clone_node);
1511           /* If the node is root_node itself, it means the epsilon closure
1512              has a loop.  Then tie it to the destination of the root_node.  */
1513           if (org_node == root_node && clone_node != org_node)
1514             {
1515               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1516               if (BE (! ok, 0))
1517                 return REG_ESPACE;
1518               break;
1519             }
1520           /* In case the node has another constraint, append it.  */
1521           constraint |= dfa->nodes[org_node].constraint;
1522           clone_dest = duplicate_node (dfa, org_dest, constraint);
1523           if (BE (clone_dest == REG_MISSING, 0))
1524             return REG_ESPACE;
1525           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526           if (BE (! ok, 0))
1527             return REG_ESPACE;
1528         }
1529       else /* dfa->edests[org_node].nelem == 2 */
1530         {
1531           /* In case of the node can epsilon-transit, and it has two
1532              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533           org_dest = dfa->edests[org_node].elems[0];
1534           re_node_set_empty (dfa->edests + clone_node);
1535           /* Search for a duplicated node which satisfies the constraint.  */
1536           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537           if (clone_dest == REG_MISSING)
1538             {
1539               /* There is no such duplicated node, create a new one.  */
1540               reg_errcode_t err;
1541               clone_dest = duplicate_node (dfa, org_dest, constraint);
1542               if (BE (clone_dest == REG_MISSING, 0))
1543                 return REG_ESPACE;
1544               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545               if (BE (! ok, 0))
1546                 return REG_ESPACE;
1547               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548                                             root_node, constraint);
1549               if (BE (err != REG_NOERROR, 0))
1550                 return err;
1551             }
1552           else
1553             {
1554               /* There is a duplicated node which satisfies the constraint,
1555                  use it to avoid infinite loop.  */
1556               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557               if (BE (! ok, 0))
1558                 return REG_ESPACE;
1559             }
1560
1561           org_dest = dfa->edests[org_node].elems[1];
1562           clone_dest = duplicate_node (dfa, org_dest, constraint);
1563           if (BE (clone_dest == REG_MISSING, 0))
1564             return REG_ESPACE;
1565           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566           if (BE (! ok, 0))
1567             return REG_ESPACE;
1568         }
1569       org_node = org_dest;
1570       clone_node = clone_dest;
1571     }
1572   return REG_NOERROR;
1573 }
1574
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576    satisfies the constraint CONSTRAINT.  */
1577
1578 static Idx
1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1580                         unsigned int constraint)
1581 {
1582   Idx idx;
1583   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584     {
1585       if (org_node == dfa->org_indices[idx]
1586           && constraint == dfa->nodes[idx].constraint)
1587         return idx; /* Found.  */
1588     }
1589   return REG_MISSING; /* Not found.  */
1590 }
1591
1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1593    Return the index of the new node, or REG_MISSING if insufficient storage is
1594    available.  */
1595
1596 static Idx
1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1598 {
1599   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1600   if (BE (dup_idx != REG_MISSING, 1))
1601     {
1602       dfa->nodes[dup_idx].constraint = constraint;
1603       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1604       dfa->nodes[dup_idx].duplicated = 1;
1605
1606       /* Store the index of the original node.  */
1607       dfa->org_indices[dup_idx] = org_idx;
1608     }
1609   return dup_idx;
1610 }
1611
1612 static reg_errcode_t
1613 calc_inveclosure (re_dfa_t *dfa)
1614 {
1615   Idx src, idx;
1616   bool ok;
1617   for (idx = 0; idx < dfa->nodes_len; ++idx)
1618     re_node_set_init_empty (dfa->inveclosures + idx);
1619
1620   for (src = 0; src < dfa->nodes_len; ++src)
1621     {
1622       Idx *elems = dfa->eclosures[src].elems;
1623       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624         {
1625           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1626           if (BE (! ok, 0))
1627             return REG_ESPACE;
1628         }
1629     }
1630
1631   return REG_NOERROR;
1632 }
1633
1634 /* Calculate "eclosure" for all the node in DFA.  */
1635
1636 static reg_errcode_t
1637 calc_eclosure (re_dfa_t *dfa)
1638 {
1639   Idx node_idx;
1640   bool incomplete;
1641 #ifdef DEBUG
1642   assert (dfa->nodes_len > 0);
1643 #endif
1644   incomplete = false;
1645   /* For each nodes, calculate epsilon closure.  */
1646   for (node_idx = 0; ; ++node_idx)
1647     {
1648       reg_errcode_t err;
1649       re_node_set eclosure_elem;
1650       if (node_idx == dfa->nodes_len)
1651         {
1652           if (!incomplete)
1653             break;
1654           incomplete = false;
1655           node_idx = 0;
1656         }
1657
1658 #ifdef DEBUG
1659       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1660 #endif
1661
1662       /* If we have already calculated, skip it.  */
1663       if (dfa->eclosures[node_idx].nelem != 0)
1664         continue;
1665       /* Calculate epsilon closure of `node_idx'.  */
1666       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1667       if (BE (err != REG_NOERROR, 0))
1668         return err;
1669
1670       if (dfa->eclosures[node_idx].nelem == 0)
1671         {
1672           incomplete = true;
1673           re_node_set_free (&eclosure_elem);
1674         }
1675     }
1676   return REG_NOERROR;
1677 }
1678
1679 /* Calculate epsilon closure of NODE.  */
1680
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1683 {
1684   reg_errcode_t err;
1685   Idx i;
1686   re_node_set eclosure;
1687   bool ok;
1688   bool incomplete = false;
1689   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690   if (BE (err != REG_NOERROR, 0))
1691     return err;
1692
1693   /* This indicates that we are calculating this node now.
1694      We reference this value to avoid infinite loop.  */
1695   dfa->eclosures[node].nelem = REG_MISSING;
1696
1697   /* If the current node has constraints, duplicate all nodes
1698      since they must inherit the constraints.  */
1699   if (dfa->nodes[node].constraint
1700       && dfa->edests[node].nelem
1701       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702     {
1703       err = duplicate_node_closure (dfa, node, node, node,
1704                                     dfa->nodes[node].constraint);
1705       if (BE (err != REG_NOERROR, 0))
1706         return err;
1707     }
1708
1709   /* Expand each epsilon destination nodes.  */
1710   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711     for (i = 0; i < dfa->edests[node].nelem; ++i)
1712       {
1713         re_node_set eclosure_elem;
1714         Idx edest = dfa->edests[node].elems[i];
1715         /* If calculating the epsilon closure of `edest' is in progress,
1716            return intermediate result.  */
1717         if (dfa->eclosures[edest].nelem == REG_MISSING)
1718           {
1719             incomplete = true;
1720             continue;
1721           }
1722         /* If we haven't calculated the epsilon closure of `edest' yet,
1723            calculate now. Otherwise use calculated epsilon closure.  */
1724         if (dfa->eclosures[edest].nelem == 0)
1725           {
1726             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1727             if (BE (err != REG_NOERROR, 0))
1728               return err;
1729           }
1730         else
1731           eclosure_elem = dfa->eclosures[edest];
1732         /* Merge the epsilon closure of `edest'.  */
1733         err = re_node_set_merge (&eclosure, &eclosure_elem);
1734         if (BE (err != REG_NOERROR, 0))
1735           return err;
1736         /* If the epsilon closure of `edest' is incomplete,
1737            the epsilon closure of this node is also incomplete.  */
1738         if (dfa->eclosures[edest].nelem == 0)
1739           {
1740             incomplete = true;
1741             re_node_set_free (&eclosure_elem);
1742           }
1743       }
1744
1745   /* An epsilon closure includes itself.  */
1746   ok = re_node_set_insert (&eclosure, node);
1747   if (BE (! ok, 0))
1748     return REG_ESPACE;
1749   if (incomplete && !root)
1750     dfa->eclosures[node].nelem = 0;
1751   else
1752     dfa->eclosures[node] = eclosure;
1753   *new_set = eclosure;
1754   return REG_NOERROR;
1755 }
1756 \f
1757 /* Functions for token which are used in the parser.  */
1758
1759 /* Fetch a token from INPUT.
1760    We must not use this function inside bracket expressions.  */
1761
1762 static void
1763 internal_function
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765 {
1766   re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 }
1768
1769 /* Peek a token from INPUT, and return the length of the token.
1770    We must not use this function inside bracket expressions.  */
1771
1772 static int
1773 internal_function
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775 {
1776   unsigned char c;
1777
1778   if (re_string_eoi (input))
1779     {
1780       token->type = END_OF_RE;
1781       return 0;
1782     }
1783
1784   c = re_string_peek_byte (input, 0);
1785   token->opr.c = c;
1786
1787   token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789   token->mb_partial = 0;
1790   if (input->mb_cur_max > 1 &&
1791       !re_string_first_byte (input, re_string_cur_idx (input)))
1792     {
1793       token->type = CHARACTER;
1794       token->mb_partial = 1;
1795       return 1;
1796     }
1797 #endif
1798   if (c == '\\')
1799     {
1800       unsigned char c2;
1801       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802         {
1803           token->type = BACK_SLASH;
1804           return 1;
1805         }
1806
1807       c2 = re_string_peek_byte_case (input, 1);
1808       token->opr.c = c2;
1809       token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811       if (input->mb_cur_max > 1)
1812         {
1813           wint_t wc = re_string_wchar_at (input,
1814                                           re_string_cur_idx (input) + 1);
1815           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816         }
1817       else
1818 #endif
1819         token->word_char = IS_WORD_CHAR (c2) != 0;
1820
1821       switch (c2)
1822         {
1823         case '|':
1824           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825             token->type = OP_ALT;
1826           break;
1827         case '1': case '2': case '3': case '4': case '5':
1828         case '6': case '7': case '8': case '9':
1829           if (!(syntax & RE_NO_BK_REFS))
1830             {
1831               token->type = OP_BACK_REF;
1832               token->opr.idx = c2 - '1';
1833             }
1834           break;
1835         case '<':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             {
1838               token->type = ANCHOR;
1839               token->opr.ctx_type = WORD_FIRST;
1840             }
1841           break;
1842         case '>':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = WORD_LAST;
1847             }
1848           break;
1849         case 'b':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = WORD_DELIM;
1854             }
1855           break;
1856         case 'B':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             {
1859               token->type = ANCHOR;
1860               token->opr.ctx_type = NOT_WORD_DELIM;
1861             }
1862           break;
1863         case 'w':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             token->type = OP_WORD;
1866           break;
1867         case 'W':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             token->type = OP_NOTWORD;
1870           break;
1871         case 's':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_SPACE;
1874           break;
1875         case 'S':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_NOTSPACE;
1878           break;
1879         case '`':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = BUF_FIRST;
1884             }
1885           break;
1886         case '\'':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = BUF_LAST;
1891             }
1892           break;
1893         case '(':
1894           if (!(syntax & RE_NO_BK_PARENS))
1895             token->type = OP_OPEN_SUBEXP;
1896           break;
1897         case ')':
1898           if (!(syntax & RE_NO_BK_PARENS))
1899             token->type = OP_CLOSE_SUBEXP;
1900           break;
1901         case '+':
1902           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903             token->type = OP_DUP_PLUS;
1904           break;
1905         case '?':
1906           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907             token->type = OP_DUP_QUESTION;
1908           break;
1909         case '{':
1910           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911             token->type = OP_OPEN_DUP_NUM;
1912           break;
1913         case '}':
1914           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915             token->type = OP_CLOSE_DUP_NUM;
1916           break;
1917         default:
1918           break;
1919         }
1920       return 2;
1921     }
1922
1923   token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925   if (input->mb_cur_max > 1)
1926     {
1927       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929     }
1930   else
1931 #endif
1932     token->word_char = IS_WORD_CHAR (token->opr.c);
1933
1934   switch (c)
1935     {
1936     case '\n':
1937       if (syntax & RE_NEWLINE_ALT)
1938         token->type = OP_ALT;
1939       break;
1940     case '|':
1941       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942         token->type = OP_ALT;
1943       break;
1944     case '*':
1945       token->type = OP_DUP_ASTERISK;
1946       break;
1947     case '+':
1948       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949         token->type = OP_DUP_PLUS;
1950       break;
1951     case '?':
1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953         token->type = OP_DUP_QUESTION;
1954       break;
1955     case '{':
1956       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957         token->type = OP_OPEN_DUP_NUM;
1958       break;
1959     case '}':
1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961         token->type = OP_CLOSE_DUP_NUM;
1962       break;
1963     case '(':
1964       if (syntax & RE_NO_BK_PARENS)
1965         token->type = OP_OPEN_SUBEXP;
1966       break;
1967     case ')':
1968       if (syntax & RE_NO_BK_PARENS)
1969         token->type = OP_CLOSE_SUBEXP;
1970       break;
1971     case '[':
1972       token->type = OP_OPEN_BRACKET;
1973       break;
1974     case '.':
1975       token->type = OP_PERIOD;
1976       break;
1977     case '^':
1978       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979           re_string_cur_idx (input) != 0)
1980         {
1981           char prev = re_string_peek_byte (input, -1);
1982           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983             break;
1984         }
1985       token->type = ANCHOR;
1986       token->opr.ctx_type = LINE_FIRST;
1987       break;
1988     case '$':
1989       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990           re_string_cur_idx (input) + 1 != re_string_length (input))
1991         {
1992           re_token_t next;
1993           re_string_skip_bytes (input, 1);
1994           peek_token (&next, input, syntax);
1995           re_string_skip_bytes (input, -1);
1996           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997             break;
1998         }
1999       token->type = ANCHOR;
2000       token->opr.ctx_type = LINE_LAST;
2001       break;
2002     default:
2003       break;
2004     }
2005   return 1;
2006 }
2007
2008 /* Peek a token from INPUT, and return the length of the token.
2009    We must not use this function out of bracket expressions.  */
2010
2011 static int
2012 internal_function
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014 {
2015   unsigned char c;
2016   if (re_string_eoi (input))
2017     {
2018       token->type = END_OF_RE;
2019       return 0;
2020     }
2021   c = re_string_peek_byte (input, 0);
2022   token->opr.c = c;
2023
2024 #ifdef RE_ENABLE_I18N
2025   if (input->mb_cur_max > 1 &&
2026       !re_string_first_byte (input, re_string_cur_idx (input)))
2027     {
2028       token->type = CHARACTER;
2029       return 1;
2030     }
2031 #endif /* RE_ENABLE_I18N */
2032
2033   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034       && re_string_cur_idx (input) + 1 < re_string_length (input))
2035     {
2036       /* In this case, '\' escape a character.  */
2037       unsigned char c2;
2038       re_string_skip_bytes (input, 1);
2039       c2 = re_string_peek_byte (input, 0);
2040       token->opr.c = c2;
2041       token->type = CHARACTER;
2042       return 1;
2043     }
2044   if (c == '[') /* '[' is a special char in a bracket exps.  */
2045     {
2046       unsigned char c2;
2047       int token_len;
2048       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049         c2 = re_string_peek_byte (input, 1);
2050       else
2051         c2 = 0;
2052       token->opr.c = c2;
2053       token_len = 2;
2054       switch (c2)
2055         {
2056         case '.':
2057           token->type = OP_OPEN_COLL_ELEM;
2058           break;
2059         case '=':
2060           token->type = OP_OPEN_EQUIV_CLASS;
2061           break;
2062         case ':':
2063           if (syntax & RE_CHAR_CLASSES)
2064             {
2065               token->type = OP_OPEN_CHAR_CLASS;
2066               break;
2067             }
2068           /* else fall through.  */
2069         default:
2070           token->type = CHARACTER;
2071           token->opr.c = c;
2072           token_len = 1;
2073           break;
2074         }
2075       return token_len;
2076     }
2077   switch (c)
2078     {
2079     case '-':
2080       token->type = OP_CHARSET_RANGE;
2081       break;
2082     case ']':
2083       token->type = OP_CLOSE_BRACKET;
2084       break;
2085     case '^':
2086       token->type = OP_NON_MATCH_LIST;
2087       break;
2088     default:
2089       token->type = CHARACTER;
2090     }
2091   return 1;
2092 }
2093 \f
2094 /* Functions for parser.  */
2095
2096 /* Entry point of the parser.
2097    Parse the regular expression REGEXP and return the structure tree.
2098    If an error is occured, ERR is set by error code, and return NULL.
2099    This function build the following tree, from regular expression <reg_exp>:
2100            CAT
2101            / \
2102           /   \
2103    <reg_exp>  EOR
2104
2105    CAT means concatenation.
2106    EOR means end of regular expression.  */
2107
2108 static bin_tree_t *
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110        reg_errcode_t *err)
2111 {
2112   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113   bin_tree_t *tree, *eor, *root;
2114   re_token_t current_token;
2115   dfa->syntax = syntax;
2116   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119     return NULL;
2120   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121   if (tree != NULL)
2122     root = create_tree (dfa, tree, eor, CONCAT);
2123   else
2124     root = eor;
2125   if (BE (eor == NULL || root == NULL, 0))
2126     {
2127       *err = REG_ESPACE;
2128       return NULL;
2129     }
2130   return root;
2131 }
2132
2133 /* This function build the following tree, from regular expression
2134    <branch1>|<branch2>:
2135            ALT
2136            / \
2137           /   \
2138    <branch1> <branch2>
2139
2140    ALT means alternative, which represents the operator `|'.  */
2141
2142 static bin_tree_t *
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2145 {
2146   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147   bin_tree_t *tree, *branch = NULL;
2148   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150     return NULL;
2151
2152   while (token->type == OP_ALT)
2153     {
2154       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155       if (token->type != OP_ALT && token->type != END_OF_RE
2156           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157         {
2158           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160             return NULL;
2161         }
2162       else
2163         branch = NULL;
2164       tree = create_tree (dfa, tree, branch, OP_ALT);
2165       if (BE (tree == NULL, 0))
2166         {
2167           *err = REG_ESPACE;
2168           return NULL;
2169         }
2170     }
2171   return tree;
2172 }
2173
2174 /* This function build the following tree, from regular expression
2175    <exp1><exp2>:
2176         CAT
2177         / \
2178        /   \
2179    <exp1> <exp2>
2180
2181    CAT means concatenation.  */
2182
2183 static bin_tree_t *
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2186 {
2187   bin_tree_t *tree, *expr;
2188   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191     return NULL;
2192
2193   while (token->type != OP_ALT && token->type != END_OF_RE
2194          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195     {
2196       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2197       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2198         {
2199           return NULL;
2200         }
2201       if (tree != NULL && expr != NULL)
2202         {
2203           tree = create_tree (dfa, tree, expr, CONCAT);
2204           if (tree == NULL)
2205             {
2206               *err = REG_ESPACE;
2207               return NULL;
2208             }
2209         }
2210       else if (tree == NULL)
2211         tree = expr;
2212       /* Otherwise expr == NULL, we don't need to create new tree.  */
2213     }
2214   return tree;
2215 }
2216
2217 /* This function build the following tree, from regular expression a*:
2218          *
2219          |
2220          a
2221 */
2222
2223 static bin_tree_t *
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2226 {
2227   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228   bin_tree_t *tree;
2229   switch (token->type)
2230     {
2231     case CHARACTER:
2232       tree = create_token_tree (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234         {
2235           *err = REG_ESPACE;
2236           return NULL;
2237         }
2238 #ifdef RE_ENABLE_I18N
2239       if (dfa->mb_cur_max > 1)
2240         {
2241           while (!re_string_eoi (regexp)
2242                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243             {
2244               bin_tree_t *mbc_remain;
2245               fetch_token (token, regexp, syntax);
2246               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248               if (BE (mbc_remain == NULL || tree == NULL, 0))
2249                 {
2250                   *err = REG_ESPACE;
2251                   return NULL;
2252                 }
2253             }
2254         }
2255 #endif
2256       break;
2257     case OP_OPEN_SUBEXP:
2258       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260         return NULL;
2261       break;
2262     case OP_OPEN_BRACKET:
2263       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265         return NULL;
2266       break;
2267     case OP_BACK_REF:
2268       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269         {
2270           *err = REG_ESUBREG;
2271           return NULL;
2272         }
2273       dfa->used_bkref_map |= 1 << token->opr.idx;
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276         {
2277           *err = REG_ESPACE;
2278           return NULL;
2279         }
2280       ++dfa->nbackref;
2281       dfa->has_mb_node = 1;
2282       break;
2283     case OP_OPEN_DUP_NUM:
2284       if (syntax & RE_CONTEXT_INVALID_DUP)
2285         {
2286           *err = REG_BADRPT;
2287           return NULL;
2288         }
2289       /* FALLTHROUGH */
2290     case OP_DUP_ASTERISK:
2291     case OP_DUP_PLUS:
2292     case OP_DUP_QUESTION:
2293       if (syntax & RE_CONTEXT_INVALID_OPS)
2294         {
2295           *err = REG_BADRPT;
2296           return NULL;
2297         }
2298       else if (syntax & RE_CONTEXT_INDEP_OPS)
2299         {
2300           fetch_token (token, regexp, syntax);
2301           return parse_expression (regexp, preg, token, syntax, nest, err);
2302         }
2303       /* else fall through  */
2304     case OP_CLOSE_SUBEXP:
2305       if ((token->type == OP_CLOSE_SUBEXP) &&
2306           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307         {
2308           *err = REG_ERPAREN;
2309           return NULL;
2310         }
2311       /* else fall through  */
2312     case OP_CLOSE_DUP_NUM:
2313       /* We treat it as a normal character.  */
2314
2315       /* Then we can these characters as normal characters.  */
2316       token->type = CHARACTER;
2317       /* mb_partial and word_char bits should be initialized already
2318          by peek_token.  */
2319       tree = create_token_tree (dfa, NULL, NULL, token);
2320       if (BE (tree == NULL, 0))
2321         {
2322           *err = REG_ESPACE;
2323           return NULL;
2324         }
2325       break;
2326     case ANCHOR:
2327       if ((token->opr.ctx_type
2328            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329           && dfa->word_ops_used == 0)
2330         init_word_char (dfa);
2331       if (token->opr.ctx_type == WORD_DELIM
2332           || token->opr.ctx_type == NOT_WORD_DELIM)
2333         {
2334           bin_tree_t *tree_first, *tree_last;
2335           if (token->opr.ctx_type == WORD_DELIM)
2336             {
2337               token->opr.ctx_type = WORD_FIRST;
2338               tree_first = create_token_tree (dfa, NULL, NULL, token);
2339               token->opr.ctx_type = WORD_LAST;
2340             }
2341           else
2342             {
2343               token->opr.ctx_type = INSIDE_WORD;
2344               tree_first = create_token_tree (dfa, NULL, NULL, token);
2345               token->opr.ctx_type = INSIDE_NOTWORD;
2346             }
2347           tree_last = create_token_tree (dfa, NULL, NULL, token);
2348           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350             {
2351               *err = REG_ESPACE;
2352               return NULL;
2353             }
2354         }
2355       else
2356         {
2357           tree = create_token_tree (dfa, NULL, NULL, token);
2358           if (BE (tree == NULL, 0))
2359             {
2360               *err = REG_ESPACE;
2361               return NULL;
2362             }
2363         }
2364       /* We must return here, since ANCHORs can't be followed
2365          by repetition operators.
2366          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368       fetch_token (token, regexp, syntax);
2369       return tree;
2370     case OP_PERIOD:
2371       tree = create_token_tree (dfa, NULL, NULL, token);
2372       if (BE (tree == NULL, 0))
2373         {
2374           *err = REG_ESPACE;
2375           return NULL;
2376         }
2377       if (dfa->mb_cur_max > 1)
2378         dfa->has_mb_node = 1;
2379       break;
2380     case OP_WORD:
2381     case OP_NOTWORD:
2382       tree = build_charclass_op (dfa, regexp->trans,
2383                                  (const unsigned char *) "alnum",
2384                                  (const unsigned char *) "_",
2385                                  token->type == OP_NOTWORD, err);
2386       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387         return NULL;
2388       break;
2389     case OP_SPACE:
2390     case OP_NOTSPACE:
2391       tree = build_charclass_op (dfa, regexp->trans,
2392                                  (const unsigned char *) "space",
2393                                  (const unsigned char *) "",
2394                                  token->type == OP_NOTSPACE, err);
2395       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396         return NULL;
2397       break;
2398     case OP_ALT:
2399     case END_OF_RE:
2400       return NULL;
2401     case BACK_SLASH:
2402       *err = REG_EESCAPE;
2403       return NULL;
2404     default:
2405       /* Must not happen?  */
2406 #ifdef DEBUG
2407       assert (0);
2408 #endif
2409       return NULL;
2410     }
2411   fetch_token (token, regexp, syntax);
2412
2413   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415     {
2416       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418         return NULL;
2419       /* In BRE consecutive duplications are not allowed.  */
2420       if ((syntax & RE_CONTEXT_INVALID_DUP)
2421           && (token->type == OP_DUP_ASTERISK
2422               || token->type == OP_OPEN_DUP_NUM))
2423         {
2424           *err = REG_BADRPT;
2425           return NULL;
2426         }
2427     }
2428
2429   return tree;
2430 }
2431
2432 /* This function build the following tree, from regular expression
2433    (<reg_exp>):
2434          SUBEXP
2435             |
2436         <reg_exp>
2437 */
2438
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2442 {
2443   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444   bin_tree_t *tree;
2445   size_t cur_nsub;
2446   cur_nsub = preg->re_nsub++;
2447
2448   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450   /* The subexpression may be a null string.  */
2451   if (token->type == OP_CLOSE_SUBEXP)
2452     tree = NULL;
2453   else
2454     {
2455       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457         *err = REG_EPAREN;
2458       if (BE (*err != REG_NOERROR, 0))
2459         return NULL;
2460     }
2461
2462   if (cur_nsub <= '9' - '1')
2463     dfa->completed_bkref_map |= 1 << cur_nsub;
2464
2465   tree = create_tree (dfa, tree, NULL, SUBEXP);
2466   if (BE (tree == NULL, 0))
2467     {
2468       *err = REG_ESPACE;
2469       return NULL;
2470     }
2471   tree->token.opr.idx = cur_nsub;
2472   return tree;
2473 }
2474
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2476
2477 static bin_tree_t *
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480 {
2481   bin_tree_t *tree = NULL, *old_tree = NULL;
2482   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2483   re_token_t start_token = *token;
2484
2485   if (token->type == OP_OPEN_DUP_NUM)
2486     {
2487       end = 0;
2488       start = fetch_number (regexp, token, syntax);
2489       if (start == REG_MISSING)
2490         {
2491           if (token->type == CHARACTER && token->opr.c == ',')
2492             start = 0; /* We treat "{,m}" as "{0,m}".  */
2493           else
2494             {
2495               *err = REG_BADBR; /* <re>{} is invalid.  */
2496               return NULL;
2497             }
2498         }
2499       if (BE (start != REG_ERROR, 1))
2500         {
2501           /* We treat "{n}" as "{n,n}".  */
2502           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503                  : ((token->type == CHARACTER && token->opr.c == ',')
2504                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2505         }
2506       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2507         {
2508           /* Invalid sequence.  */
2509           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2510             {
2511               if (token->type == END_OF_RE)
2512                 *err = REG_EBRACE;
2513               else
2514                 *err = REG_BADBR;
2515
2516               return NULL;
2517             }
2518
2519           /* If the syntax bit is set, rollback.  */
2520           re_string_set_index (regexp, start_idx);
2521           *token = start_token;
2522           token->type = CHARACTER;
2523           /* mb_partial and word_char bits should be already initialized by
2524              peek_token.  */
2525           return elem;
2526         }
2527
2528       if (BE ((end != REG_MISSING && start > end)
2529               || token->type != OP_CLOSE_DUP_NUM, 0))
2530         {
2531           /* First number greater than second.  */
2532           *err = REG_BADBR;
2533           return NULL;
2534         }
2535     }
2536   else
2537     {
2538       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2540     }
2541
2542   fetch_token (token, regexp, syntax);
2543
2544   if (BE (elem == NULL, 0))
2545     return NULL;
2546   if (BE (start == 0 && end == 0, 0))
2547     {
2548       postorder (elem, free_tree, NULL);
2549       return NULL;
2550     }
2551
2552   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2553   if (BE (start > 0, 0))
2554     {
2555       tree = elem;
2556       for (i = 2; i <= start; ++i)
2557         {
2558           elem = duplicate_tree (elem, dfa);
2559           tree = create_tree (dfa, tree, elem, CONCAT);
2560           if (BE (elem == NULL || tree == NULL, 0))
2561             goto parse_dup_op_espace;
2562         }
2563
2564       if (start == end)
2565         return tree;
2566
2567       /* Duplicate ELEM before it is marked optional.  */
2568       elem = duplicate_tree (elem, dfa);
2569       old_tree = tree;
2570     }
2571   else
2572     old_tree = NULL;
2573
2574   if (elem->token.type == SUBEXP)
2575     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576
2577   tree = create_tree (dfa, elem, NULL,
2578                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2579   if (BE (tree == NULL, 0))
2580     goto parse_dup_op_espace;
2581
2582 /* From gnulib's "intprops.h":
2583    True if the arithmetic type T is signed.  */
2584 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2585
2586   /* This loop is actually executed only when end != REG_MISSING,
2587      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2588      already created the start+1-th copy.  */
2589   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2590     for (i = start + 2; i <= end; ++i)
2591       {
2592         elem = duplicate_tree (elem, dfa);
2593         tree = create_tree (dfa, tree, elem, CONCAT);
2594         if (BE (elem == NULL || tree == NULL, 0))
2595           goto parse_dup_op_espace;
2596
2597         tree = create_tree (dfa, tree, NULL, OP_ALT);
2598         if (BE (tree == NULL, 0))
2599           goto parse_dup_op_espace;
2600       }
2601
2602   if (old_tree)
2603     tree = create_tree (dfa, old_tree, tree, CONCAT);
2604
2605   return tree;
2606
2607  parse_dup_op_espace:
2608   *err = REG_ESPACE;
2609   return NULL;
2610 }
2611
2612 /* Size of the names for collating symbol/equivalence_class/character_class.
2613    I'm not sure, but maybe enough.  */
2614 #define BRACKET_NAME_BUF_SIZE 32
2615
2616 #ifndef _LIBC
2617   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2618      Build the range expression which starts from START_ELEM, and ends
2619      at END_ELEM.  The result are written to MBCSET and SBCSET.
2620      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2621      mbcset->range_ends, is a pointer argument sinse we may
2622      update it.  */
2623
2624 static reg_errcode_t
2625 internal_function
2626 # ifdef RE_ENABLE_I18N
2627 build_range_exp (const reg_syntax_t syntax,
2628                  bitset_t sbcset,
2629                  re_charset_t *mbcset,
2630                  Idx *range_alloc,
2631                  const bracket_elem_t *start_elem,
2632                  const bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (const reg_syntax_t syntax,
2635                  bitset_t sbcset,
2636                  const bracket_elem_t *start_elem,
2637                  const bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2639 {
2640   unsigned int start_ch, end_ch;
2641   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2642   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644           0))
2645     return REG_ERANGE;
2646
2647   /* We can handle no multi character collating elements without libc
2648      support.  */
2649   if (BE ((start_elem->type == COLL_SYM
2650            && strlen ((char *) start_elem->opr.name) > 1)
2651           || (end_elem->type == COLL_SYM
2652               && strlen ((char *) end_elem->opr.name) > 1), 0))
2653     return REG_ECOLLATE;
2654
2655 # ifdef RE_ENABLE_I18N
2656   {
2657     wchar_t wc;
2658     wint_t start_wc;
2659     wint_t end_wc;
2660     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661
2662     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664                    : 0));
2665     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667                  : 0));
2668     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669                 ? __btowc (start_ch) : start_elem->opr.wch);
2670     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671               ? __btowc (end_ch) : end_elem->opr.wch);
2672     if (start_wc == WEOF || end_wc == WEOF)
2673       return REG_ECOLLATE;
2674     cmp_buf[0] = start_wc;
2675     cmp_buf[4] = end_wc;
2676
2677     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2678             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2679       return REG_ERANGE;
2680
2681     /* Got valid collation sequence values, add them as a new entry.
2682        However, for !_LIBC we have no collation elements: if the
2683        character set is single byte, the single byte character set
2684        that we build below suffices.  parse_bracket_exp passes
2685        no MBCSET if dfa->mb_cur_max == 1.  */
2686     if (mbcset)
2687       {
2688         /* Check the space of the arrays.  */
2689         if (BE (*range_alloc == mbcset->nranges, 0))
2690           {
2691             /* There is not enough space, need realloc.  */
2692             wchar_t *new_array_start, *new_array_end;
2693             Idx new_nranges;
2694
2695             /* +1 in case of mbcset->nranges is 0.  */
2696             new_nranges = 2 * mbcset->nranges + 1;
2697             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2698                are NULL if *range_alloc == 0.  */
2699             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2700                                           new_nranges);
2701             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2702                                         new_nranges);
2703
2704             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2705               return REG_ESPACE;
2706
2707             mbcset->range_starts = new_array_start;
2708             mbcset->range_ends = new_array_end;
2709             *range_alloc = new_nranges;
2710           }
2711
2712         mbcset->range_starts[mbcset->nranges] = start_wc;
2713         mbcset->range_ends[mbcset->nranges++] = end_wc;
2714       }
2715
2716     /* Build the table for single byte characters.  */
2717     for (wc = 0; wc < SBC_MAX; ++wc)
2718       {
2719         cmp_buf[2] = wc;
2720         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2721             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2722           bitset_set (sbcset, wc);
2723       }
2724   }
2725 # else /* not RE_ENABLE_I18N */
2726   {
2727     unsigned int ch;
2728     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2729                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2730                    : 0));
2731     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2732               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2733                  : 0));
2734     if (start_ch > end_ch)
2735       return REG_ERANGE;
2736     /* Build the table for single byte characters.  */
2737     for (ch = 0; ch < SBC_MAX; ++ch)
2738       if (start_ch <= ch  && ch <= end_ch)
2739         bitset_set (sbcset, ch);
2740   }
2741 # endif /* not RE_ENABLE_I18N */
2742   return REG_NOERROR;
2743 }
2744 #endif /* not _LIBC */
2745
2746 #ifndef _LIBC
2747 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2748    Build the collating element which is represented by NAME.
2749    The result are written to MBCSET and SBCSET.
2750    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2751    pointer argument since we may update it.  */
2752
2753 static reg_errcode_t
2754 internal_function
2755 build_collating_symbol (bitset_t sbcset,
2756 # ifdef RE_ENABLE_I18N
2757                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2758 # endif
2759                         const unsigned char *name)
2760 {
2761   size_t name_len = strlen ((const char *) name);
2762   if (BE (name_len != 1, 0))
2763     return REG_ECOLLATE;
2764   else
2765     {
2766       bitset_set (sbcset, name[0]);
2767       return REG_NOERROR;
2768     }
2769 }
2770 #endif /* not _LIBC */
2771
2772 /* This function parse bracket expression like "[abc]", "[a-c]",
2773    "[[.a-a.]]" etc.  */
2774
2775 static bin_tree_t *
2776 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2777                    reg_syntax_t syntax, reg_errcode_t *err)
2778 {
2779 #ifdef _LIBC
2780   const unsigned char *collseqmb;
2781   const char *collseqwc;
2782   uint32_t nrules;
2783   int32_t table_size;
2784   const int32_t *symb_table;
2785   const unsigned char *extra;
2786
2787   /* Local function for parse_bracket_exp used in _LIBC environement.
2788      Seek the collating symbol entry correspondings to NAME.
2789      Return the index of the symbol in the SYMB_TABLE.  */
2790
2791   auto inline int32_t
2792   __attribute ((always_inline))
2793   seek_collating_symbol_entry (name, name_len)
2794          const unsigned char *name;
2795          size_t name_len;
2796     {
2797       int32_t hash = elem_hash ((const char *) name, name_len);
2798       int32_t elem = hash % table_size;
2799       if (symb_table[2 * elem] != 0)
2800         {
2801           int32_t second = hash % (table_size - 2) + 1;
2802
2803           do
2804             {
2805               /* First compare the hashing value.  */
2806               if (symb_table[2 * elem] == hash
2807                   /* Compare the length of the name.  */
2808                   && name_len == extra[symb_table[2 * elem + 1]]
2809                   /* Compare the name.  */
2810                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2811                              name_len) == 0)
2812                 {
2813                   /* Yep, this is the entry.  */
2814                   break;
2815                 }
2816
2817               /* Next entry.  */
2818               elem += second;
2819             }
2820           while (symb_table[2 * elem] != 0);
2821         }
2822       return elem;
2823     }
2824
2825   /* Local function for parse_bracket_exp used in _LIBC environment.
2826      Look up the collation sequence value of BR_ELEM.
2827      Return the value if succeeded, UINT_MAX otherwise.  */
2828
2829   auto inline unsigned int
2830   __attribute ((always_inline))
2831   lookup_collation_sequence_value (br_elem)
2832          bracket_elem_t *br_elem;
2833     {
2834       if (br_elem->type == SB_CHAR)
2835         {
2836           /*
2837           if (MB_CUR_MAX == 1)
2838           */
2839           if (nrules == 0)
2840             return collseqmb[br_elem->opr.ch];
2841           else
2842             {
2843               wint_t wc = __btowc (br_elem->opr.ch);
2844               return __collseq_table_lookup (collseqwc, wc);
2845             }
2846         }
2847       else if (br_elem->type == MB_CHAR)
2848         {
2849           if (nrules != 0)
2850             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2851         }
2852       else if (br_elem->type == COLL_SYM)
2853         {
2854           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2855           if (nrules != 0)
2856             {
2857               int32_t elem, idx;
2858               elem = seek_collating_symbol_entry (br_elem->opr.name,
2859                                                   sym_name_len);
2860               if (symb_table[2 * elem] != 0)
2861                 {
2862                   /* We found the entry.  */
2863                   idx = symb_table[2 * elem + 1];
2864                   /* Skip the name of collating element name.  */
2865                   idx += 1 + extra[idx];
2866                   /* Skip the byte sequence of the collating element.  */
2867                   idx += 1 + extra[idx];
2868                   /* Adjust for the alignment.  */
2869                   idx = (idx + 3) & ~3;
2870                   /* Skip the multibyte collation sequence value.  */
2871                   idx += sizeof (unsigned int);
2872                   /* Skip the wide char sequence of the collating element.  */
2873                   idx += sizeof (unsigned int) *
2874                     (1 + *(unsigned int *) (extra + idx));
2875                   /* Return the collation sequence value.  */
2876                   return *(unsigned int *) (extra + idx);
2877                 }
2878               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2879                 {
2880                   /* No valid character.  Match it as a single byte
2881                      character.  */
2882                   return collseqmb[br_elem->opr.name[0]];
2883                 }
2884             }
2885           else if (sym_name_len == 1)
2886             return collseqmb[br_elem->opr.name[0]];
2887         }
2888       return UINT_MAX;
2889     }
2890
2891   /* Local function for parse_bracket_exp used in _LIBC environement.
2892      Build the range expression which starts from START_ELEM, and ends
2893      at END_ELEM.  The result are written to MBCSET and SBCSET.
2894      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2895      mbcset->range_ends, is a pointer argument sinse we may
2896      update it.  */
2897
2898   auto inline reg_errcode_t
2899   __attribute ((always_inline))
2900   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2901          re_charset_t *mbcset;
2902          Idx *range_alloc;
2903          bitset_t sbcset;
2904          bracket_elem_t *start_elem, *end_elem;
2905     {
2906       unsigned int ch;
2907       uint32_t start_collseq;
2908       uint32_t end_collseq;
2909
2910       /* Equivalence Classes and Character Classes can't be a range
2911          start/end.  */
2912       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2913               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2914               0))
2915         return REG_ERANGE;
2916
2917       start_collseq = lookup_collation_sequence_value (start_elem);
2918       end_collseq = lookup_collation_sequence_value (end_elem);
2919       /* Check start/end collation sequence values.  */
2920       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2921         return REG_ECOLLATE;
2922       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2923         return REG_ERANGE;
2924
2925       /* Got valid collation sequence values, add them as a new entry.
2926          However, if we have no collation elements, and the character set
2927          is single byte, the single byte character set that we
2928          build below suffices. */
2929       if (nrules > 0 || dfa->mb_cur_max > 1)
2930         {
2931           /* Check the space of the arrays.  */
2932           if (BE (*range_alloc == mbcset->nranges, 0))
2933             {
2934               /* There is not enough space, need realloc.  */
2935               uint32_t *new_array_start;
2936               uint32_t *new_array_end;
2937               Idx new_nranges;
2938
2939               /* +1 in case of mbcset->nranges is 0.  */
2940               new_nranges = 2 * mbcset->nranges + 1;
2941               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2942                                             new_nranges);
2943               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2944                                           new_nranges);
2945
2946               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2947                 return REG_ESPACE;
2948
2949               mbcset->range_starts = new_array_start;
2950               mbcset->range_ends = new_array_end;
2951               *range_alloc = new_nranges;
2952             }
2953
2954           mbcset->range_starts[mbcset->nranges] = start_collseq;
2955           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2956         }
2957
2958       /* Build the table for single byte characters.  */
2959       for (ch = 0; ch < SBC_MAX; ch++)
2960         {
2961           uint32_t ch_collseq;
2962           /*
2963           if (MB_CUR_MAX == 1)
2964           */
2965           if (nrules == 0)
2966             ch_collseq = collseqmb[ch];
2967           else
2968             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2969           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2970             bitset_set (sbcset, ch);
2971         }
2972       return REG_NOERROR;
2973     }
2974
2975   /* Local function for parse_bracket_exp used in _LIBC environement.
2976      Build the collating element which is represented by NAME.
2977      The result are written to MBCSET and SBCSET.
2978      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2979      pointer argument sinse we may update it.  */
2980
2981   auto inline reg_errcode_t
2982   __attribute ((always_inline))
2983   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2984          re_charset_t *mbcset;
2985          Idx *coll_sym_alloc;
2986          bitset_t sbcset;
2987          const unsigned char *name;
2988     {
2989       int32_t elem, idx;
2990       size_t name_len = strlen ((const char *) name);
2991       if (nrules != 0)
2992         {
2993           elem = seek_collating_symbol_entry (name, name_len);
2994           if (symb_table[2 * elem] != 0)
2995             {
2996               /* We found the entry.  */
2997               idx = symb_table[2 * elem + 1];
2998               /* Skip the name of collating element name.  */
2999               idx += 1 + extra[idx];
3000             }
3001           else if (symb_table[2 * elem] == 0 && name_len == 1)
3002             {
3003               /* No valid character, treat it as a normal
3004                  character.  */
3005               bitset_set (sbcset, name[0]);
3006               return REG_NOERROR;
3007             }
3008           else
3009             return REG_ECOLLATE;
3010
3011           /* Got valid collation sequence, add it as a new entry.  */
3012           /* Check the space of the arrays.  */
3013           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3014             {
3015               /* Not enough, realloc it.  */
3016               /* +1 in case of mbcset->ncoll_syms is 0.  */
3017               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3018               /* Use realloc since mbcset->coll_syms is NULL
3019                  if *alloc == 0.  */
3020               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3021                                                    new_coll_sym_alloc);
3022               if (BE (new_coll_syms == NULL, 0))
3023                 return REG_ESPACE;
3024               mbcset->coll_syms = new_coll_syms;
3025               *coll_sym_alloc = new_coll_sym_alloc;
3026             }
3027           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3028           return REG_NOERROR;
3029         }
3030       else
3031         {
3032           if (BE (name_len != 1, 0))
3033             return REG_ECOLLATE;
3034           else
3035             {
3036               bitset_set (sbcset, name[0]);
3037               return REG_NOERROR;
3038             }
3039         }
3040     }
3041 #endif
3042
3043   re_token_t br_token;
3044   re_bitset_ptr_t sbcset;
3045 #ifdef RE_ENABLE_I18N
3046   re_charset_t *mbcset;
3047   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3048   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3049 #endif /* not RE_ENABLE_I18N */
3050   bool non_match = false;
3051   bin_tree_t *work_tree;
3052   int token_len;
3053   bool first_round = true;
3054 #ifdef _LIBC
3055   collseqmb = (const unsigned char *)
3056     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3057   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3058   if (nrules)
3059     {
3060       /*
3061       if (MB_CUR_MAX > 1)
3062       */
3063       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3064       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3065       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3066                                                   _NL_COLLATE_SYMB_TABLEMB);
3067       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3068                                                    _NL_COLLATE_SYMB_EXTRAMB);
3069     }
3070 #endif
3071   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3072 #ifdef RE_ENABLE_I18N
3073   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3074 #endif /* RE_ENABLE_I18N */
3075 #ifdef RE_ENABLE_I18N
3076   if (BE (sbcset == NULL || mbcset == NULL, 0))
3077 #else
3078   if (BE (sbcset == NULL, 0))
3079 #endif /* RE_ENABLE_I18N */
3080     {
3081       *err = REG_ESPACE;
3082       return NULL;
3083     }
3084
3085   token_len = peek_token_bracket (token, regexp, syntax);
3086   if (BE (token->type == END_OF_RE, 0))
3087     {
3088       *err = REG_BADPAT;
3089       goto parse_bracket_exp_free_return;
3090     }
3091   if (token->type == OP_NON_MATCH_LIST)
3092     {
3093 #ifdef RE_ENABLE_I18N
3094       mbcset->non_match = 1;
3095 #endif /* not RE_ENABLE_I18N */
3096       non_match = true;
3097       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3098         bitset_set (sbcset, '\n');
3099       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3100       token_len = peek_token_bracket (token, regexp, syntax);
3101       if (BE (token->type == END_OF_RE, 0))
3102         {
3103           *err = REG_BADPAT;
3104           goto parse_bracket_exp_free_return;
3105         }
3106     }
3107
3108   /* We treat the first ']' as a normal character.  */
3109   if (token->type == OP_CLOSE_BRACKET)
3110     token->type = CHARACTER;
3111
3112   while (1)
3113     {
3114       bracket_elem_t start_elem, end_elem;
3115       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3116       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3117       reg_errcode_t ret;
3118       int token_len2 = 0;
3119       bool is_range_exp = false;
3120       re_token_t token2;
3121
3122       start_elem.opr.name = start_name_buf;
3123       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3124                                    syntax, first_round);
3125       if (BE (ret != REG_NOERROR, 0))
3126         {
3127           *err = ret;
3128           goto parse_bracket_exp_free_return;
3129         }
3130       first_round = false;
3131
3132       /* Get information about the next token.  We need it in any case.  */
3133       token_len = peek_token_bracket (token, regexp, syntax);
3134
3135       /* Do not check for ranges if we know they are not allowed.  */
3136       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3137         {
3138           if (BE (token->type == END_OF_RE, 0))
3139             {
3140               *err = REG_EBRACK;
3141               goto parse_bracket_exp_free_return;
3142             }
3143           if (token->type == OP_CHARSET_RANGE)
3144             {
3145               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3146               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3147               if (BE (token2.type == END_OF_RE, 0))
3148                 {
3149                   *err = REG_EBRACK;
3150                   goto parse_bracket_exp_free_return;
3151                 }
3152               if (token2.type == OP_CLOSE_BRACKET)
3153                 {
3154                   /* We treat the last '-' as a normal character.  */
3155                   re_string_skip_bytes (regexp, -token_len);
3156                   token->type = CHARACTER;
3157                 }
3158               else
3159                 is_range_exp = true;
3160             }
3161         }
3162
3163       if (is_range_exp == true)
3164         {
3165           end_elem.opr.name = end_name_buf;
3166           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3167                                        dfa, syntax, true);
3168           if (BE (ret != REG_NOERROR, 0))
3169             {
3170               *err = ret;
3171               goto parse_bracket_exp_free_return;
3172             }
3173
3174           token_len = peek_token_bracket (token, regexp, syntax);
3175
3176 #ifdef _LIBC
3177           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3178                                   &start_elem, &end_elem);
3179 #else
3180 # ifdef RE_ENABLE_I18N
3181           *err = build_range_exp (syntax, sbcset,
3182                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3183                                   &range_alloc, &start_elem, &end_elem);
3184 # else
3185           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3186 # endif
3187 #endif /* RE_ENABLE_I18N */
3188           if (BE (*err != REG_NOERROR, 0))
3189             goto parse_bracket_exp_free_return;
3190         }
3191       else
3192         {
3193           switch (start_elem.type)
3194             {
3195             case SB_CHAR:
3196               bitset_set (sbcset, start_elem.opr.ch);
3197               break;
3198 #ifdef RE_ENABLE_I18N
3199             case MB_CHAR:
3200               /* Check whether the array has enough space.  */
3201               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3202                 {
3203                   wchar_t *new_mbchars;
3204                   /* Not enough, realloc it.  */
3205                   /* +1 in case of mbcset->nmbchars is 0.  */
3206                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3207                   /* Use realloc since array is NULL if *alloc == 0.  */
3208                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3209                                             mbchar_alloc);
3210                   if (BE (new_mbchars == NULL, 0))
3211                     goto parse_bracket_exp_espace;
3212                   mbcset->mbchars = new_mbchars;
3213                 }
3214               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3215               break;
3216 #endif /* RE_ENABLE_I18N */
3217             case EQUIV_CLASS:
3218               *err = build_equiv_class (sbcset,
3219 #ifdef RE_ENABLE_I18N
3220                                         mbcset, &equiv_class_alloc,
3221 #endif /* RE_ENABLE_I18N */
3222                                         start_elem.opr.name);
3223               if (BE (*err != REG_NOERROR, 0))
3224                 goto parse_bracket_exp_free_return;
3225               break;
3226             case COLL_SYM:
3227               *err = build_collating_symbol (sbcset,
3228 #ifdef RE_ENABLE_I18N
3229                                              mbcset, &coll_sym_alloc,
3230 #endif /* RE_ENABLE_I18N */
3231                                              start_elem.opr.name);
3232               if (BE (*err != REG_NOERROR, 0))
3233                 goto parse_bracket_exp_free_return;
3234               break;
3235             case CHAR_CLASS:
3236               *err = build_charclass (regexp->trans, sbcset,
3237 #ifdef RE_ENABLE_I18N
3238                                       mbcset, &char_class_alloc,
3239 #endif /* RE_ENABLE_I18N */
3240                                       start_elem.opr.name, syntax);
3241               if (BE (*err != REG_NOERROR, 0))
3242                goto parse_bracket_exp_free_return;
3243               break;
3244             default:
3245               assert (0);
3246               break;
3247             }
3248         }
3249       if (BE (token->type == END_OF_RE, 0))
3250         {
3251           *err = REG_EBRACK;
3252           goto parse_bracket_exp_free_return;
3253         }
3254       if (token->type == OP_CLOSE_BRACKET)
3255         break;
3256     }
3257
3258   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3259
3260   /* If it is non-matching list.  */
3261   if (non_match)
3262     bitset_not (sbcset);
3263
3264 #ifdef RE_ENABLE_I18N
3265   /* Ensure only single byte characters are set.  */
3266   if (dfa->mb_cur_max > 1)
3267     bitset_mask (sbcset, dfa->sb_char);
3268
3269   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3270       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3271                                                      || mbcset->non_match)))
3272     {
3273       bin_tree_t *mbc_tree;
3274       int sbc_idx;
3275       /* Build a tree for complex bracket.  */
3276       dfa->has_mb_node = 1;
3277       br_token.type = COMPLEX_BRACKET;
3278       br_token.opr.mbcset = mbcset;
3279       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3280       if (BE (mbc_tree == NULL, 0))
3281         goto parse_bracket_exp_espace;
3282       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3283         if (sbcset[sbc_idx])
3284           break;
3285       /* If there are no bits set in sbcset, there is no point
3286          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3287       if (sbc_idx < BITSET_WORDS)
3288         {
3289           /* Build a tree for simple bracket.  */
3290           br_token.type = SIMPLE_BRACKET;
3291           br_token.opr.sbcset = sbcset;
3292           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3293           if (BE (work_tree == NULL, 0))
3294             goto parse_bracket_exp_espace;
3295
3296           /* Then join them by ALT node.  */
3297           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3298           if (BE (work_tree == NULL, 0))
3299             goto parse_bracket_exp_espace;
3300         }
3301       else
3302         {
3303           re_free (sbcset);
3304           work_tree = mbc_tree;
3305         }
3306     }
3307   else
3308 #endif /* not RE_ENABLE_I18N */
3309     {
3310 #ifdef RE_ENABLE_I18N
3311       free_charset (mbcset);
3312 #endif
3313       /* Build a tree for simple bracket.  */
3314       br_token.type = SIMPLE_BRACKET;
3315       br_token.opr.sbcset = sbcset;
3316       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3317       if (BE (work_tree == NULL, 0))
3318         goto parse_bracket_exp_espace;
3319     }
3320   return work_tree;
3321
3322  parse_bracket_exp_espace:
3323   *err = REG_ESPACE;
3324  parse_bracket_exp_free_return:
3325   re_free (sbcset);
3326 #ifdef RE_ENABLE_I18N
3327   free_charset (mbcset);
3328 #endif /* RE_ENABLE_I18N */
3329   return NULL;
3330 }
3331
3332 /* Parse an element in the bracket expression.  */
3333
3334 static reg_errcode_t
3335 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3336                        re_token_t *token, int token_len, re_dfa_t *dfa,
3337                        reg_syntax_t syntax, bool accept_hyphen)
3338 {
3339 #ifdef RE_ENABLE_I18N
3340   int cur_char_size;
3341   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3342   if (cur_char_size > 1)
3343     {
3344       elem->type = MB_CHAR;
3345       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3346       re_string_skip_bytes (regexp, cur_char_size);
3347       return REG_NOERROR;
3348     }
3349 #endif /* RE_ENABLE_I18N */
3350   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3351   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3352       || token->type == OP_OPEN_EQUIV_CLASS)
3353     return parse_bracket_symbol (elem, regexp, token);
3354   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3355     {
3356       /* A '-' must only appear as anything but a range indicator before
3357          the closing bracket.  Everything else is an error.  */
3358       re_token_t token2;
3359       (void) peek_token_bracket (&token2, regexp, syntax);
3360       if (token2.type != OP_CLOSE_BRACKET)
3361         /* The actual error value is not standardized since this whole
3362            case is undefined.  But ERANGE makes good sense.  */
3363         return REG_ERANGE;
3364     }
3365   elem->type = SB_CHAR;
3366   elem->opr.ch = token->opr.c;
3367   return REG_NOERROR;
3368 }
3369
3370 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3371    such as [:<character_class>:], [.<collating_element>.], and
3372    [=<equivalent_class>=].  */
3373
3374 static reg_errcode_t
3375 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3376                       re_token_t *token)
3377 {
3378   unsigned char ch, delim = token->opr.c;
3379   int i = 0;
3380   if (re_string_eoi(regexp))
3381     return REG_EBRACK;
3382   for (;; ++i)
3383     {
3384       if (i >= BRACKET_NAME_BUF_SIZE)
3385         return REG_EBRACK;
3386       if (token->type == OP_OPEN_CHAR_CLASS)
3387         ch = re_string_fetch_byte_case (regexp);
3388       else
3389         ch = re_string_fetch_byte (regexp);
3390       if (re_string_eoi(regexp))
3391         return REG_EBRACK;
3392       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3393         break;
3394       elem->opr.name[i] = ch;
3395     }
3396   re_string_skip_bytes (regexp, 1);
3397   elem->opr.name[i] = '\0';
3398   switch (token->type)
3399     {
3400     case OP_OPEN_COLL_ELEM:
3401       elem->type = COLL_SYM;
3402       break;
3403     case OP_OPEN_EQUIV_CLASS:
3404       elem->type = EQUIV_CLASS;
3405       break;
3406     case OP_OPEN_CHAR_CLASS:
3407       elem->type = CHAR_CLASS;
3408       break;
3409     default:
3410       break;
3411     }
3412   return REG_NOERROR;
3413 }
3414
3415   /* Helper function for parse_bracket_exp.
3416      Build the equivalence class which is represented by NAME.
3417      The result are written to MBCSET and SBCSET.
3418      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3419      is a pointer argument sinse we may update it.  */
3420
3421 static reg_errcode_t
3422 #ifdef RE_ENABLE_I18N
3423 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3424                    Idx *equiv_class_alloc, const unsigned char *name)
3425 #else /* not RE_ENABLE_I18N */
3426 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3427 #endif /* not RE_ENABLE_I18N */
3428 {
3429 #ifdef _LIBC
3430   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3431   if (nrules != 0)
3432     {
3433       const int32_t *table, *indirect;
3434       const unsigned char *weights, *extra, *cp;
3435       unsigned char char_buf[2];
3436       int32_t idx1, idx2;
3437       unsigned int ch;
3438       size_t len;
3439       /* This #include defines a local function!  */
3440 # include <locale/weight.h>
3441       /* Calculate the index for equivalence class.  */
3442       cp = name;
3443       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3444       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3445                                                _NL_COLLATE_WEIGHTMB);
3446       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3447                                                    _NL_COLLATE_EXTRAMB);
3448       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3449                                                 _NL_COLLATE_INDIRECTMB);
3450       idx1 = findidx (&cp);
3451       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3452         /* This isn't a valid character.  */
3453         return REG_ECOLLATE;
3454
3455       /* Build single byte matcing table for this equivalence class.  */
3456       char_buf[1] = (unsigned char) '\0';
3457       len = weights[idx1 & 0xffffff];
3458       for (ch = 0; ch < SBC_MAX; ++ch)
3459         {
3460           char_buf[0] = ch;
3461           cp = char_buf;
3462           idx2 = findidx (&cp);
3463 /*
3464           idx2 = table[ch];
3465 */
3466           if (idx2 == 0)
3467             /* This isn't a valid character.  */
3468             continue;
3469           /* Compare only if the length matches and the collation rule
3470              index is the same.  */
3471           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3472             {
3473               int cnt = 0;
3474
3475               while (cnt <= len &&
3476                      weights[(idx1 & 0xffffff) + 1 + cnt]
3477                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3478                 ++cnt;
3479
3480               if (cnt > len)
3481                 bitset_set (sbcset, ch);
3482             }
3483         }
3484       /* Check whether the array has enough space.  */
3485       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3486         {
3487           /* Not enough, realloc it.  */
3488           /* +1 in case of mbcset->nequiv_classes is 0.  */
3489           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3490           /* Use realloc since the array is NULL if *alloc == 0.  */
3491           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3492                                                    int32_t,
3493                                                    new_equiv_class_alloc);
3494           if (BE (new_equiv_classes == NULL, 0))
3495             return REG_ESPACE;
3496           mbcset->equiv_classes = new_equiv_classes;
3497           *equiv_class_alloc = new_equiv_class_alloc;
3498         }
3499       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3500     }
3501   else
3502 #endif /* _LIBC */
3503     {
3504       if (BE (strlen ((const char *) name) != 1, 0))
3505         return REG_ECOLLATE;
3506       bitset_set (sbcset, *name);
3507     }
3508   return REG_NOERROR;
3509 }
3510
3511   /* Helper function for parse_bracket_exp.
3512      Build the character class which is represented by NAME.
3513      The result are written to MBCSET and SBCSET.
3514      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3515      is a pointer argument sinse we may update it.  */
3516
3517 static reg_errcode_t
3518 #ifdef RE_ENABLE_I18N
3519 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3520                  re_charset_t *mbcset, Idx *char_class_alloc,
3521                  const unsigned char *class_name, reg_syntax_t syntax)
3522 #else /* not RE_ENABLE_I18N */
3523 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3524                  const unsigned char *class_name, reg_syntax_t syntax)
3525 #endif /* not RE_ENABLE_I18N */
3526 {
3527   int i;
3528   const char *name = (const char *) class_name;
3529
3530   /* In case of REG_ICASE "upper" and "lower" match the both of
3531      upper and lower cases.  */
3532   if ((syntax & RE_ICASE)
3533       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3534     name = "alpha";
3535
3536 #ifdef RE_ENABLE_I18N
3537   /* Check the space of the arrays.  */
3538   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3539     {
3540       /* Not enough, realloc it.  */
3541       /* +1 in case of mbcset->nchar_classes is 0.  */
3542       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3543       /* Use realloc since array is NULL if *alloc == 0.  */
3544       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3545                                                new_char_class_alloc);
3546       if (BE (new_char_classes == NULL, 0))
3547         return REG_ESPACE;
3548       mbcset->char_classes = new_char_classes;
3549       *char_class_alloc = new_char_class_alloc;
3550     }
3551   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3552 #endif /* RE_ENABLE_I18N */
3553
3554 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3555   do {                                          \
3556     if (BE (trans != NULL, 0))                  \
3557       {                                         \
3558         for (i = 0; i < SBC_MAX; ++i)           \
3559           if (ctype_func (i))                   \
3560             bitset_set (sbcset, trans[i]);      \
3561       }                                         \
3562     else                                        \
3563       {                                         \
3564         for (i = 0; i < SBC_MAX; ++i)           \
3565           if (ctype_func (i))                   \
3566             bitset_set (sbcset, i);             \
3567       }                                         \
3568   } while (0)
3569
3570   if (strcmp (name, "alnum") == 0)
3571     BUILD_CHARCLASS_LOOP (isalnum);
3572   else if (strcmp (name, "cntrl") == 0)
3573     BUILD_CHARCLASS_LOOP (iscntrl);
3574   else if (strcmp (name, "lower") == 0)
3575     BUILD_CHARCLASS_LOOP (islower);
3576   else if (strcmp (name, "space") == 0)
3577     BUILD_CHARCLASS_LOOP (isspace);
3578   else if (strcmp (name, "alpha") == 0)
3579     BUILD_CHARCLASS_LOOP (isalpha);
3580   else if (strcmp (name, "digit") == 0)
3581     BUILD_CHARCLASS_LOOP (isdigit);
3582   else if (strcmp (name, "print") == 0)
3583     BUILD_CHARCLASS_LOOP (isprint);
3584   else if (strcmp (name, "upper") == 0)
3585     BUILD_CHARCLASS_LOOP (isupper);
3586   else if (strcmp (name, "blank") == 0)
3587     BUILD_CHARCLASS_LOOP (isblank);
3588   else if (strcmp (name, "graph") == 0)
3589     BUILD_CHARCLASS_LOOP (isgraph);
3590   else if (strcmp (name, "punct") == 0)
3591     BUILD_CHARCLASS_LOOP (ispunct);
3592   else if (strcmp (name, "xdigit") == 0)
3593     BUILD_CHARCLASS_LOOP (isxdigit);
3594   else
3595     return REG_ECTYPE;
3596
3597   return REG_NOERROR;
3598 }
3599
3600 static bin_tree_t *
3601 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3602                     const unsigned char *class_name,
3603                     const unsigned char *extra, bool non_match,
3604                     reg_errcode_t *err)
3605 {
3606   re_bitset_ptr_t sbcset;
3607 #ifdef RE_ENABLE_I18N
3608   re_charset_t *mbcset;
3609   Idx alloc = 0;
3610 #endif /* not RE_ENABLE_I18N */
3611   reg_errcode_t ret;
3612   re_token_t br_token;
3613   bin_tree_t *tree;
3614
3615   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3616 #ifdef RE_ENABLE_I18N
3617   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3618 #endif /* RE_ENABLE_I18N */
3619
3620 #ifdef RE_ENABLE_I18N
3621   if (BE (sbcset == NULL || mbcset == NULL, 0))
3622 #else /* not RE_ENABLE_I18N */
3623   if (BE (sbcset == NULL, 0))
3624 #endif /* not RE_ENABLE_I18N */
3625     {
3626       *err = REG_ESPACE;
3627       return NULL;
3628     }
3629
3630   if (non_match)
3631     {
3632 #ifdef RE_ENABLE_I18N
3633       mbcset->non_match = 1;
3634 #endif /* not RE_ENABLE_I18N */
3635     }
3636
3637   /* We don't care the syntax in this case.  */
3638   ret = build_charclass (trans, sbcset,
3639 #ifdef RE_ENABLE_I18N
3640                          mbcset, &alloc,
3641 #endif /* RE_ENABLE_I18N */
3642                          class_name, 0);
3643
3644   if (BE (ret != REG_NOERROR, 0))
3645     {
3646       re_free (sbcset);
3647 #ifdef RE_ENABLE_I18N
3648       free_charset (mbcset);
3649 #endif /* RE_ENABLE_I18N */
3650       *err = ret;
3651       return NULL;
3652     }
3653   /* \w match '_' also.  */
3654   for (; *extra; extra++)
3655     bitset_set (sbcset, *extra);
3656
3657   /* If it is non-matching list.  */
3658   if (non_match)
3659     bitset_not (sbcset);
3660
3661 #ifdef RE_ENABLE_I18N
3662   /* Ensure only single byte characters are set.  */
3663   if (dfa->mb_cur_max > 1)
3664     bitset_mask (sbcset, dfa->sb_char);
3665 #endif
3666
3667   /* Build a tree for simple bracket.  */
3668   br_token.type = SIMPLE_BRACKET;
3669   br_token.opr.sbcset = sbcset;
3670   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3671   if (BE (tree == NULL, 0))
3672     goto build_word_op_espace;
3673
3674 #ifdef RE_ENABLE_I18N
3675   if (dfa->mb_cur_max > 1)
3676     {
3677       bin_tree_t *mbc_tree;
3678       /* Build a tree for complex bracket.  */
3679       br_token.type = COMPLEX_BRACKET;
3680       br_token.opr.mbcset = mbcset;
3681       dfa->has_mb_node = 1;
3682       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3683       if (BE (mbc_tree == NULL, 0))
3684         goto build_word_op_espace;
3685       /* Then join them by ALT node.  */
3686       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3687       if (BE (mbc_tree != NULL, 1))
3688         return tree;
3689     }
3690   else
3691     {
3692       free_charset (mbcset);
3693       return tree;
3694     }
3695 #else /* not RE_ENABLE_I18N */
3696   return tree;
3697 #endif /* not RE_ENABLE_I18N */
3698
3699  build_word_op_espace:
3700   re_free (sbcset);
3701 #ifdef RE_ENABLE_I18N
3702   free_charset (mbcset);
3703 #endif /* RE_ENABLE_I18N */
3704   *err = REG_ESPACE;
3705   return NULL;
3706 }
3707
3708 /* This is intended for the expressions like "a{1,3}".
3709    Fetch a number from `input', and return the number.
3710    Return REG_MISSING if the number field is empty like "{,1}".
3711    Return REG_ERROR if an error occurred.  */
3712
3713 static Idx
3714 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3715 {
3716   Idx num = REG_MISSING;
3717   unsigned char c;
3718   while (1)
3719     {
3720       fetch_token (token, input, syntax);
3721       c = token->opr.c;
3722       if (BE (token->type == END_OF_RE, 0))
3723         return REG_ERROR;
3724       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3725         break;
3726       num = ((token->type != CHARACTER || c < '0' || '9' < c
3727               || num == REG_ERROR)
3728              ? REG_ERROR
3729              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3730       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3731     }
3732   return num;
3733 }
3734 \f
3735 #ifdef RE_ENABLE_I18N
3736 static void
3737 free_charset (re_charset_t *cset)
3738 {
3739   re_free (cset->mbchars);
3740 # ifdef _LIBC
3741   re_free (cset->coll_syms);
3742   re_free (cset->equiv_classes);
3743   re_free (cset->range_starts);
3744   re_free (cset->range_ends);
3745 # endif
3746   re_free (cset->char_classes);
3747   re_free (cset);
3748 }
3749 #endif /* RE_ENABLE_I18N */
3750 \f
3751 /* Functions for binary tree operation.  */
3752
3753 /* Create a tree node.  */
3754
3755 static bin_tree_t *
3756 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3757              re_token_type_t type)
3758 {
3759   re_token_t t;
3760   t.type = type;
3761   return create_token_tree (dfa, left, right, &t);
3762 }
3763
3764 static bin_tree_t *
3765 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3766                    const re_token_t *token)
3767 {
3768   bin_tree_t *tree;
3769   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3770     {
3771       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3772
3773       if (storage == NULL)
3774         return NULL;
3775       storage->next = dfa->str_tree_storage;
3776       dfa->str_tree_storage = storage;
3777       dfa->str_tree_storage_idx = 0;
3778     }
3779   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3780
3781   tree->parent = NULL;
3782   tree->left = left;
3783   tree->right = right;
3784   tree->token = *token;
3785   tree->token.duplicated = 0;
3786   tree->token.opt_subexp = 0;
3787   tree->first = NULL;
3788   tree->next = NULL;
3789   tree->node_idx = REG_MISSING;
3790
3791   if (left != NULL)
3792     left->parent = tree;
3793   if (right != NULL)
3794     right->parent = tree;
3795   return tree;
3796 }
3797
3798 /* Mark the tree SRC as an optional subexpression.
3799    To be called from preorder or postorder.  */
3800
3801 static reg_errcode_t
3802 mark_opt_subexp (void *extra, bin_tree_t *node)
3803 {
3804   Idx idx = (Idx) (long) extra;
3805   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3806     node->token.opt_subexp = 1;
3807
3808   return REG_NOERROR;
3809 }
3810
3811 /* Free the allocated memory inside NODE. */
3812
3813 static void
3814 free_token (re_token_t *node)
3815 {
3816 #ifdef RE_ENABLE_I18N
3817   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3818     free_charset (node->opr.mbcset);
3819   else
3820 #endif /* RE_ENABLE_I18N */
3821     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3822       re_free (node->opr.sbcset);
3823 }
3824
3825 /* Worker function for tree walking.  Free the allocated memory inside NODE
3826    and its children. */
3827
3828 static reg_errcode_t
3829 free_tree (void *extra, bin_tree_t *node)
3830 {
3831   free_token (&node->token);
3832   return REG_NOERROR;
3833 }
3834
3835
3836 /* Duplicate the node SRC, and return new node.  This is a preorder
3837    visit similar to the one implemented by the generic visitor, but
3838    we need more infrastructure to maintain two parallel trees --- so,
3839    it's easier to duplicate.  */
3840
3841 static bin_tree_t *
3842 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3843 {
3844   const bin_tree_t *node;
3845   bin_tree_t *dup_root;
3846   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3847
3848   for (node = root; ; )
3849     {
3850       /* Create a new tree and link it back to the current parent.  */
3851       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3852       if (*p_new == NULL)
3853         return NULL;
3854       (*p_new)->parent = dup_node;
3855       (*p_new)->token.duplicated = 1;
3856       dup_node = *p_new;
3857
3858       /* Go to the left node, or up and to the right.  */
3859       if (node->left)
3860         {
3861           node = node->left;
3862           p_new = &dup_node->left;
3863         }
3864       else
3865         {
3866           const bin_tree_t *prev = NULL;
3867           while (node->right == prev || node->right == NULL)
3868             {
3869               prev = node;
3870               node = node->parent;
3871               dup_node = dup_node->parent;
3872               if (!node)
3873                 return dup_root;
3874             }
3875           node = node->right;
3876           p_new = &dup_node->right;
3877         }
3878     }
3879 }