1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
5 Software Foundation, Inc.
6 This file is part of the GNU C Library.
7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3, or (at your option)
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License along
20 with this program; if not, write to the Free Software Foundation,
21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 size_t length, reg_syntax_t syntax);
25 static void re_compile_fastmap_iter (regex_t *bufp,
26 const re_dfastate_t *init_state,
28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 static void free_charset (re_charset_t *cset);
31 #endif /* RE_ENABLE_I18N */
32 static void free_workarea_compile (regex_t *preg);
33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 static void optimize_utf8 (re_dfa_t *dfa);
37 static reg_errcode_t analyze (regex_t *preg);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
53 unsigned int constraint);
54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 static int peek_token (re_token_t *token, re_string_t *input,
61 reg_syntax_t syntax) internal_function;
62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 reg_syntax_t syntax, reg_errcode_t *err);
64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 Idx nest, reg_errcode_t *err);
76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 re_dfa_t *dfa, re_token_t *token,
78 reg_syntax_t syntax, reg_errcode_t *err);
79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 re_token_t *token, reg_syntax_t syntax,
82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_token_t *token, int token_len,
88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 Idx *equiv_class_alloc,
95 const unsigned char *name);
96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
99 Idx *char_class_alloc,
100 const unsigned char *class_name,
101 reg_syntax_t syntax);
102 #else /* not RE_ENABLE_I18N */
103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 const unsigned char *name);
105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 const unsigned char *class_name,
108 reg_syntax_t syntax);
109 #endif /* not RE_ENABLE_I18N */
110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 RE_TRANSLATE_TYPE trans,
112 const unsigned char *class_name,
113 const unsigned char *extra,
114 bool non_match, reg_errcode_t *err);
115 static bin_tree_t *create_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 re_token_type_t type);
118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 const re_token_t *token);
121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 static void free_token (re_token_t *node);
123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 /* This table gives an error message for each of the error codes listed
127 in regex.h. Obviously the order here has to be same as there.
128 POSIX doesn't require that we do anything for REG_NOERROR,
129 but why not be nice? */
131 static const char __re_error_msgid[] =
133 #define REG_NOERROR_IDX 0
134 gettext_noop ("Success") /* REG_NOERROR */
136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137 gettext_noop ("No match") /* REG_NOMATCH */
139 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
157 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
170 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
179 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185 static const size_t __re_error_msgid_idx[] =
206 /* Entry points for GNU code. */
208 /* re_compile_pattern is the GNU regular expression compiler: it
209 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 Returns 0 if the pattern was valid, otherwise an error string.
212 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213 are set in BUFP on entry. */
217 re_compile_pattern (pattern, length, bufp)
220 struct re_pattern_buffer *bufp;
221 #else /* size_t might promote */
223 re_compile_pattern (const char *pattern, size_t length,
224 struct re_pattern_buffer *bufp)
229 /* And GNU code determines whether or not to get register information
230 by passing null for the REGS argument to re_match, etc., not by
231 setting no_sub, unless RE_NO_SUB is set. */
232 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
234 /* Match anchors at newline. */
235 bufp->newline_anchor = 1;
237 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
241 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
244 weak_alias (__re_compile_pattern, re_compile_pattern)
247 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
248 also be assigned to arbitrarily: each pattern buffer stores its own
249 syntax, so it can be changed between regex compilations. */
250 /* This has no initializer because initialized variables in Emacs
251 become read-only after dumping. */
252 reg_syntax_t re_syntax_options;
255 /* Specify the precise syntax of regexps for compilation. This provides
256 for compatibility for various utilities which historically have
257 different, incompatible syntaxes.
259 The argument SYNTAX is a bit mask comprised of the various bits
260 defined in regex.h. We return the old syntax. */
263 re_set_syntax (syntax)
266 reg_syntax_t ret = re_syntax_options;
268 re_syntax_options = syntax;
272 weak_alias (__re_set_syntax, re_set_syntax)
276 re_compile_fastmap (bufp)
277 struct re_pattern_buffer *bufp;
279 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
280 char *fastmap = bufp->fastmap;
282 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
283 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
284 if (dfa->init_state != dfa->init_state_word)
285 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
286 if (dfa->init_state != dfa->init_state_nl)
287 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
288 if (dfa->init_state != dfa->init_state_begbuf)
289 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
290 bufp->fastmap_accurate = 1;
294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
298 __attribute ((always_inline))
299 re_set_fastmap (char *fastmap, bool icase, int ch)
303 fastmap[tolower (ch)] = 1;
306 /* Helper function for re_compile_fastmap.
307 Compile fastmap for the initial_state INIT_STATE. */
310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
313 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
315 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
316 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
318 Idx node = init_state->nodes.elems[node_cnt];
319 re_token_type_t type = dfa->nodes[node].type;
321 if (type == CHARACTER)
323 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
324 #ifdef RE_ENABLE_I18N
325 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
327 unsigned char buf[MB_LEN_MAX];
333 *p++ = dfa->nodes[node].opr.c;
334 while (++node < dfa->nodes_len
335 && dfa->nodes[node].type == CHARACTER
336 && dfa->nodes[node].mb_partial)
337 *p++ = dfa->nodes[node].opr.c;
338 memset (&state, '\0', sizeof (state));
339 if (__mbrtowc (&wc, (const char *) buf, p - buf,
341 && (__wcrtomb ((char *) buf, towlower (wc), &state)
343 re_set_fastmap (fastmap, false, buf[0]);
347 else if (type == SIMPLE_BRACKET)
350 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
353 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
354 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
355 if (w & ((bitset_word_t) 1 << j))
356 re_set_fastmap (fastmap, icase, ch);
359 #ifdef RE_ENABLE_I18N
360 else if (type == COMPLEX_BRACKET)
362 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
366 /* See if we have to try all bytes which start multiple collation
368 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
369 collation element, and don't catch 'b' since 'b' is
370 the only collation element which starts from 'b' (and
371 it is caught by SIMPLE_BRACKET). */
372 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
373 && (cset->ncoll_syms || cset->nranges))
375 const int32_t *table = (const int32_t *)
376 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 for (i = 0; i < SBC_MAX; ++i)
379 re_set_fastmap (fastmap, icase, i);
383 /* See if we have to start the match at all multibyte characters,
384 i.e. where we would not find an invalid sequence. This only
385 applies to multibyte character sets; for single byte character
386 sets, the SIMPLE_BRACKET again suffices. */
387 if (dfa->mb_cur_max > 1
388 && (cset->nchar_classes || cset->non_match || cset->nranges
390 || cset->nequiv_classes
398 memset (&mbs, 0, sizeof (mbs));
399 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
400 re_set_fastmap (fastmap, false, (int) c);
407 /* ... Else catch all bytes which can start the mbchars. */
408 for (i = 0; i < cset->nmbchars; ++i)
412 memset (&state, '\0', sizeof (state));
413 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
417 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
419 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
424 #endif /* RE_ENABLE_I18N */
425 else if (type == OP_PERIOD
426 #ifdef RE_ENABLE_I18N
427 || type == OP_UTF8_PERIOD
428 #endif /* RE_ENABLE_I18N */
429 || type == END_OF_RE)
431 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
432 if (type == END_OF_RE)
433 bufp->can_be_null = 1;
439 /* Entry point for POSIX code. */
440 /* regcomp takes a regular expression as a string and compiles it.
442 PREG is a regex_t *. We do not expect any fields to be initialized,
443 since POSIX says we shouldn't. Thus, we set
445 `buffer' to the compiled pattern;
446 `used' to the length of the compiled pattern;
447 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
448 REG_EXTENDED bit in CFLAGS is set; otherwise, to
449 RE_SYNTAX_POSIX_BASIC;
450 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
451 `fastmap' to an allocated space for the fastmap;
452 `fastmap_accurate' to zero;
453 `re_nsub' to the number of subexpressions in PATTERN.
455 PATTERN is the address of the pattern string.
457 CFLAGS is a series of bits which affect compilation.
459 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
460 use POSIX basic syntax.
462 If REG_NEWLINE is set, then . and [^...] don't match newline.
463 Also, regexec will try a match beginning after every newline.
465 If REG_ICASE is set, then we considers upper- and lowercase
466 versions of letters to be equivalent when matching.
468 If REG_NOSUB is set, then when PREG is passed to regexec, that
469 routine will report only success or failure, and nothing about the
472 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
473 the return codes and their meanings.) */
476 regcomp (preg, pattern, cflags)
477 regex_t *_Restrict_ preg;
478 const char *_Restrict_ pattern;
482 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
483 : RE_SYNTAX_POSIX_BASIC);
489 /* Try to allocate space for the fastmap. */
490 preg->fastmap = re_malloc (char, SBC_MAX);
491 if (BE (preg->fastmap == NULL, 0))
494 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
496 /* If REG_NEWLINE is set, newlines are treated differently. */
497 if (cflags & REG_NEWLINE)
498 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
499 syntax &= ~RE_DOT_NEWLINE;
500 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
501 /* It also changes the matching behavior. */
502 preg->newline_anchor = 1;
505 preg->newline_anchor = 0;
506 preg->no_sub = !!(cflags & REG_NOSUB);
507 preg->translate = NULL;
509 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
511 /* POSIX doesn't distinguish between an unmatched open-group and an
512 unmatched close-group: both are REG_EPAREN. */
513 if (ret == REG_ERPAREN)
516 /* We have already checked preg->fastmap != NULL. */
517 if (BE (ret == REG_NOERROR, 1))
518 /* Compute the fastmap now, since regexec cannot modify the pattern
519 buffer. This function never fails in this implementation. */
520 (void) re_compile_fastmap (preg);
523 /* Some error occurred while compiling the expression. */
524 re_free (preg->fastmap);
525 preg->fastmap = NULL;
531 weak_alias (__regcomp, regcomp)
534 /* Returns a message corresponding to an error code, ERRCODE, returned
535 from either regcomp or regexec. We don't use PREG here. */
539 regerror (errcode, preg, errbuf, errbuf_size)
541 const regex_t *_Restrict_ preg;
542 char *_Restrict_ errbuf;
544 #else /* size_t might promote */
546 regerror (int errcode, const regex_t *_Restrict_ preg,
547 char *_Restrict_ errbuf, size_t errbuf_size)
554 || errcode >= (int) (sizeof (__re_error_msgid_idx)
555 / sizeof (__re_error_msgid_idx[0])), 0))
556 /* Only error codes returned by the rest of the code should be passed
557 to this routine. If we are given anything else, or if other regex
558 code generates an invalid error code, then the program has a bug.
559 Dump core so we can fix it. */
562 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
564 msg_size = strlen (msg) + 1; /* Includes the null. */
566 if (BE (errbuf_size != 0, 1))
568 size_t cpy_size = msg_size;
569 if (BE (msg_size > errbuf_size, 0))
571 cpy_size = errbuf_size - 1;
572 errbuf[cpy_size] = '\0';
574 memcpy (errbuf, msg, cpy_size);
580 weak_alias (__regerror, regerror)
584 #ifdef RE_ENABLE_I18N
585 /* This static array is used for the map to single-byte characters when
586 UTF-8 is used. Otherwise we would allocate memory just to initialize
587 it the same all the time. UTF-8 is the preferred encoding so this is
588 a worthwhile optimization. */
589 static const bitset_t utf8_sb_map =
591 /* Set the first 128 bits. */
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 # error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
602 >> (SBC_MAX % BITSET_WORD_BITS == 0
604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
610 free_dfa_content (re_dfa_t *dfa)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
638 re_dfastate_t *state = entry->array[j];
641 re_free (entry->array);
643 re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
648 re_free (dfa->subexp_map);
650 re_free (dfa->re_str);
657 /* Free dynamically allocated space used by PREG. */
663 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
672 re_free (preg->translate);
673 preg->translate = NULL;
676 weak_alias (__regfree, regfree)
679 /* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
682 #if defined _REGEX_RE_COMP || defined _LIBC
684 /* BSD has one and only one pattern buffer. */
685 static struct re_pattern_buffer re_comp_buf;
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
707 if (re_comp_buf.buffer)
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
716 if (re_comp_buf.fastmap == NULL)
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
724 /* Since `re_exec' always passes NULL for the `regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
740 libc_freeres_fn (free_mem)
742 __regfree (&re_comp_buf);
746 #endif /* _REGEX_RE_COMP */
748 /* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
756 reg_errcode_t err = REG_NOERROR;
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
769 /* Initialize the dfa. */
770 dfa = (re_dfa_t *) preg->buffer;
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
780 preg->allocated = sizeof (re_dfa_t);
781 preg->buffer = (unsigned char *) dfa;
783 preg->used = sizeof (re_dfa_t);
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
788 free_dfa_content (dfa);
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
799 __libc_lock_init (dfa->lock);
801 err = re_string_construct (®exp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
808 free_dfa_content (dfa);
814 /* Parse the regular expression, and build a structure tree. */
816 dfa->str_tree = parse (®exp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
825 #ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (®exp);
838 if (BE (err != REG_NOERROR, 0))
840 free_dfa_content (dfa);
848 /* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
852 init_dfa (re_dfa_t *dfa, size_t pat_len)
854 __re_size_t table_size;
858 #ifdef RE_ENABLE_I18N
859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
861 size_t max_i18n_object_size = 0;
863 size_t max_object_size =
864 MAX (sizeof (struct re_state_table_entry),
865 MAX (sizeof (re_token_t),
866 MAX (sizeof (re_node_set),
867 MAX (sizeof (regmatch_t),
868 max_i18n_object_size))));
870 memset (dfa, '\0', sizeof (re_dfa_t));
872 /* Force allocation of str_tree_storage the first time. */
873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
876 calculation below, and for similar doubling calculations
877 elsewhere. And it's <= rather than <, because some of the
878 doubling calculations add 1 afterwards. */
879 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
882 dfa->nodes_alloc = pat_len + 1;
883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
885 /* table_size = 2 ^ ceil(log pat_len) */
886 for (table_size = 1; ; table_size <<= 1)
887 if (table_size > pat_len)
890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891 dfa->state_hash_mask = table_size - 1;
893 dfa->mb_cur_max = MB_CUR_MAX;
895 if (dfa->mb_cur_max == 6
896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
901 codeset_name = nl_langinfo (CODESET);
902 if (strcasecmp (codeset_name, "UTF-8") == 0
903 || strcasecmp (codeset_name, "UTF8") == 0)
906 /* We check exhaustively in the loop below if this charset is a
907 superset of ASCII. */
908 dfa->map_notascii = 0;
911 #ifdef RE_ENABLE_I18N
912 if (dfa->mb_cur_max > 1)
915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
921 if (BE (dfa->sb_char == NULL, 0))
924 /* Set the bits corresponding to single byte chars. */
925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
928 wint_t wch = __btowc (ch);
930 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
932 if (isascii (ch) && wch != ch)
933 dfa->map_notascii = 1;
940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
945 /* Initialize WORD_CHAR table, which indicate which character is
946 "word". In this case "word" means that it is the word construction
947 character used by some operators like "\<", "\>", etc. */
951 init_word_char (re_dfa_t *dfa)
954 dfa->word_ops_used = 1;
955 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
956 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
957 if (isalnum (ch) || ch == '_')
958 dfa->word_char[i] |= (bitset_word_t) 1 << j;
961 /* Free the work area which are only used while compiling. */
964 free_workarea_compile (regex_t *preg)
966 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
967 bin_tree_storage_t *storage, *next;
968 for (storage = dfa->str_tree_storage; storage; storage = next)
970 next = storage->next;
973 dfa->str_tree_storage = NULL;
974 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
975 dfa->str_tree = NULL;
976 re_free (dfa->org_indices);
977 dfa->org_indices = NULL;
980 /* Create initial states for all contexts. */
983 create_initial_state (re_dfa_t *dfa)
987 re_node_set init_nodes;
989 /* Initial states have the epsilon closure of the node which is
990 the first node of the regular expression. */
991 first = dfa->str_tree->first->node_idx;
992 dfa->init_node = first;
993 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
994 if (BE (err != REG_NOERROR, 0))
997 /* The back-references which are in initial states can epsilon transit,
998 since in this case all of the subexpressions can be null.
999 Then we add epsilon closures of the nodes which are the next nodes of
1000 the back-references. */
1001 if (dfa->nbackref > 0)
1002 for (i = 0; i < init_nodes.nelem; ++i)
1004 Idx node_idx = init_nodes.elems[i];
1005 re_token_type_t type = dfa->nodes[node_idx].type;
1008 if (type != OP_BACK_REF)
1010 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1012 re_token_t *clexp_node;
1013 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014 if (clexp_node->type == OP_CLOSE_SUBEXP
1015 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1018 if (clexp_idx == init_nodes.nelem)
1021 if (type == OP_BACK_REF)
1023 Idx dest_idx = dfa->edests[node_idx].elems[0];
1024 if (!re_node_set_contains (&init_nodes, dest_idx))
1026 reg_errcode_t merge_err
1027 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1028 if (merge_err != REG_NOERROR)
1035 /* It must be the first time to invoke acquire_state. */
1036 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1037 /* We don't check ERR here, since the initial state must not be NULL. */
1038 if (BE (dfa->init_state == NULL, 0))
1040 if (dfa->init_state->has_constraint)
1042 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1044 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1046 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1050 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1051 || dfa->init_state_begbuf == NULL, 0))
1055 dfa->init_state_word = dfa->init_state_nl
1056 = dfa->init_state_begbuf = dfa->init_state;
1058 re_node_set_free (&init_nodes);
1062 #ifdef RE_ENABLE_I18N
1063 /* If it is possible to do searching in single byte encoding instead of UTF-8
1064 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1065 DFA nodes where needed. */
1068 optimize_utf8 (re_dfa_t *dfa)
1072 bool mb_chars = false;
1073 bool has_period = false;
1075 for (node = 0; node < dfa->nodes_len; ++node)
1076 switch (dfa->nodes[node].type)
1079 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1083 switch (dfa->nodes[node].opr.ctx_type)
1091 /* Word anchors etc. cannot be handled. It's okay to test
1092 opr.ctx_type since constraints (for all DFA nodes) are
1093 created by ORing one or more opr.ctx_type values. */
1103 case OP_DUP_ASTERISK:
1104 case OP_OPEN_SUBEXP:
1105 case OP_CLOSE_SUBEXP:
1107 case COMPLEX_BRACKET:
1109 case SIMPLE_BRACKET:
1110 /* Just double check. */
1112 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1114 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1115 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1117 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1127 if (mb_chars || has_period)
1128 for (node = 0; node < dfa->nodes_len; ++node)
1130 if (dfa->nodes[node].type == CHARACTER
1131 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1132 dfa->nodes[node].mb_partial = 0;
1133 else if (dfa->nodes[node].type == OP_PERIOD)
1134 dfa->nodes[node].type = OP_UTF8_PERIOD;
1137 /* The search can be in single byte locale. */
1138 dfa->mb_cur_max = 1;
1140 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145 "eclosure", and "inveclosure". */
1147 static reg_errcode_t
1148 analyze (regex_t *preg)
1150 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1153 /* Allocate arrays. */
1154 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1155 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1156 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159 || dfa->eclosures == NULL, 0))
1162 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1163 if (dfa->subexp_map != NULL)
1166 for (i = 0; i < preg->re_nsub; i++)
1167 dfa->subexp_map[i] = i;
1168 preorder (dfa->str_tree, optimize_subexps, dfa);
1169 for (i = 0; i < preg->re_nsub; i++)
1170 if (dfa->subexp_map[i] != i)
1172 if (i == preg->re_nsub)
1174 free (dfa->subexp_map);
1175 dfa->subexp_map = NULL;
1179 ret = postorder (dfa->str_tree, lower_subexps, preg);
1180 if (BE (ret != REG_NOERROR, 0))
1182 ret = postorder (dfa->str_tree, calc_first, dfa);
1183 if (BE (ret != REG_NOERROR, 0))
1185 preorder (dfa->str_tree, calc_next, dfa);
1186 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1189 ret = calc_eclosure (dfa);
1190 if (BE (ret != REG_NOERROR, 0))
1193 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1195 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1198 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199 if (BE (dfa->inveclosures == NULL, 0))
1201 ret = calc_inveclosure (dfa);
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208 implement parse tree visits. Instead, we use parent pointers and
1209 some hairy code in these two functions. */
1210 static reg_errcode_t
1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1214 bin_tree_t *node, *prev;
1216 for (node = root; ; )
1218 /* Descend down the tree, preferably to the left (or to the right
1219 if that's the only child). */
1220 while (node->left || node->right)
1228 reg_errcode_t err = fn (extra, node);
1229 if (BE (err != REG_NOERROR, 0))
1231 if (node->parent == NULL)
1234 node = node->parent;
1236 /* Go up while we have a node that is reached from the right. */
1237 while (node->right == prev || node->right == NULL);
1242 static reg_errcode_t
1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248 for (node = root; ; )
1250 reg_errcode_t err = fn (extra, node);
1251 if (BE (err != REG_NOERROR, 0))
1254 /* Go to the left node, or up and to the right. */
1259 bin_tree_t *prev = NULL;
1260 while (node->right == prev || node->right == NULL)
1263 node = node->parent;
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1274 backreferences as well. Requires a preorder visit. */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra, bin_tree_t *node)
1278 re_dfa_t *dfa = (re_dfa_t *) extra;
1280 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1282 int idx = node->token.opr.idx;
1283 node->token.opr.idx = dfa->subexp_map[idx];
1284 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1287 else if (node->token.type == SUBEXP
1288 && node->left && node->left->token.type == SUBEXP)
1290 Idx other_idx = node->left->token.opr.idx;
1292 node->left = node->left->left;
1294 node->left->parent = node;
1296 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297 if (other_idx < BITSET_WORD_BITS)
1298 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1306 static reg_errcode_t
1307 lower_subexps (void *extra, bin_tree_t *node)
1309 regex_t *preg = (regex_t *) extra;
1310 reg_errcode_t err = REG_NOERROR;
1312 if (node->left && node->left->token.type == SUBEXP)
1314 node->left = lower_subexp (&err, preg, node->left);
1316 node->left->parent = node;
1318 if (node->right && node->right->token.type == SUBEXP)
1320 node->right = lower_subexp (&err, preg, node->right);
1322 node->right->parent = node;
1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1331 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332 bin_tree_t *body = node->left;
1333 bin_tree_t *op, *cls, *tree1, *tree;
1336 /* We do not optimize empty subexpressions, because otherwise we may
1337 have bad CONCAT nodes with NULL children. This is obviously not
1338 very common, so we do not lose much. An example that triggers
1339 this case is the sed "script" /\(\)/x. */
1340 && node->left != NULL
1341 && (node->token.opr.idx >= BITSET_WORD_BITS
1342 || !(dfa->used_bkref_map
1343 & ((bitset_word_t) 1 << node->token.opr.idx))))
1346 /* Convert the SUBEXP node to the concatenation of an
1347 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1348 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351 tree = create_tree (dfa, op, tree1, CONCAT);
1352 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1358 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364 nodes. Requires a postorder visit. */
1365 static reg_errcode_t
1366 calc_first (void *extra, bin_tree_t *node)
1368 re_dfa_t *dfa = (re_dfa_t *) extra;
1369 if (node->token.type == CONCAT)
1371 node->first = node->left->first;
1372 node->node_idx = node->left->node_idx;
1377 node->node_idx = re_dfa_add_node (dfa, node->token);
1378 if (BE (node->node_idx == REG_MISSING, 0))
1380 if (node->token.type == ANCHOR)
1381 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1386 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1387 static reg_errcode_t
1388 calc_next (void *extra, bin_tree_t *node)
1390 switch (node->token.type)
1392 case OP_DUP_ASTERISK:
1393 node->left->next = node;
1396 node->left->next = node->right->first;
1397 node->right->next = node->next;
1401 node->left->next = node->next;
1403 node->right->next = node->next;
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra, bin_tree_t *node)
1413 re_dfa_t *dfa = (re_dfa_t *) extra;
1414 Idx idx = node->node_idx;
1415 reg_errcode_t err = REG_NOERROR;
1417 switch (node->token.type)
1423 assert (node->next == NULL);
1426 case OP_DUP_ASTERISK:
1430 dfa->has_plural_match = 1;
1431 if (node->left != NULL)
1432 left = node->left->first->node_idx;
1434 left = node->next->node_idx;
1435 if (node->right != NULL)
1436 right = node->right->first->node_idx;
1438 right = node->next->node_idx;
1439 assert (REG_VALID_INDEX (left));
1440 assert (REG_VALID_INDEX (right));
1441 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1446 case OP_OPEN_SUBEXP:
1447 case OP_CLOSE_SUBEXP:
1448 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1452 dfa->nexts[idx] = node->next->node_idx;
1453 if (node->token.type == OP_BACK_REF)
1454 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1458 assert (!IS_EPSILON_NODE (node->token.type));
1459 dfa->nexts[idx] = node->next->node_idx;
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468 to their own constraint. */
1470 static reg_errcode_t
1472 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1473 Idx root_node, unsigned int init_constraint)
1475 Idx org_node, clone_node;
1477 unsigned int constraint = init_constraint;
1478 for (org_node = top_org_node, clone_node = top_clone_node;;)
1480 Idx org_dest, clone_dest;
1481 if (dfa->nodes[org_node].type == OP_BACK_REF)
1483 /* If the back reference epsilon-transit, its destination must
1484 also have the constraint. Then duplicate the epsilon closure
1485 of the destination of the back reference, and store it in
1486 edests of the back reference. */
1487 org_dest = dfa->nexts[org_node];
1488 re_node_set_empty (dfa->edests + clone_node);
1489 clone_dest = duplicate_node (dfa, org_dest, constraint);
1490 if (BE (clone_dest == REG_MISSING, 0))
1492 dfa->nexts[clone_node] = dfa->nexts[org_node];
1493 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1497 else if (dfa->edests[org_node].nelem == 0)
1499 /* In case of the node can't epsilon-transit, don't duplicate the
1500 destination and store the original destination as the
1501 destination of the node. */
1502 dfa->nexts[clone_node] = dfa->nexts[org_node];
1505 else if (dfa->edests[org_node].nelem == 1)
1507 /* In case of the node can epsilon-transit, and it has only one
1509 org_dest = dfa->edests[org_node].elems[0];
1510 re_node_set_empty (dfa->edests + clone_node);
1511 /* If the node is root_node itself, it means the epsilon closure
1512 has a loop. Then tie it to the destination of the root_node. */
1513 if (org_node == root_node && clone_node != org_node)
1515 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1520 /* In case the node has another constraint, append it. */
1521 constraint |= dfa->nodes[org_node].constraint;
1522 clone_dest = duplicate_node (dfa, org_dest, constraint);
1523 if (BE (clone_dest == REG_MISSING, 0))
1525 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 else /* dfa->edests[org_node].nelem == 2 */
1531 /* In case of the node can epsilon-transit, and it has two
1532 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1533 org_dest = dfa->edests[org_node].elems[0];
1534 re_node_set_empty (dfa->edests + clone_node);
1535 /* Search for a duplicated node which satisfies the constraint. */
1536 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537 if (clone_dest == REG_MISSING)
1539 /* There is no such duplicated node, create a new one. */
1541 clone_dest = duplicate_node (dfa, org_dest, constraint);
1542 if (BE (clone_dest == REG_MISSING, 0))
1544 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548 root_node, constraint);
1549 if (BE (err != REG_NOERROR, 0))
1554 /* There is a duplicated node which satisfies the constraint,
1555 use it to avoid infinite loop. */
1556 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561 org_dest = dfa->edests[org_node].elems[1];
1562 clone_dest = duplicate_node (dfa, org_dest, constraint);
1563 if (BE (clone_dest == REG_MISSING, 0))
1565 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1569 org_node = org_dest;
1570 clone_node = clone_dest;
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576 satisfies the constraint CONSTRAINT. */
1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1580 unsigned int constraint)
1583 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1585 if (org_node == dfa->org_indices[idx]
1586 && constraint == dfa->nodes[idx].constraint)
1587 return idx; /* Found. */
1589 return REG_MISSING; /* Not found. */
1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1593 Return the index of the new node, or REG_MISSING if insufficient storage is
1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1599 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1600 if (BE (dup_idx != REG_MISSING, 1))
1602 dfa->nodes[dup_idx].constraint = constraint;
1603 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1604 dfa->nodes[dup_idx].duplicated = 1;
1606 /* Store the index of the original node. */
1607 dfa->org_indices[dup_idx] = org_idx;
1612 static reg_errcode_t
1613 calc_inveclosure (re_dfa_t *dfa)
1617 for (idx = 0; idx < dfa->nodes_len; ++idx)
1618 re_node_set_init_empty (dfa->inveclosures + idx);
1620 for (src = 0; src < dfa->nodes_len; ++src)
1622 Idx *elems = dfa->eclosures[src].elems;
1623 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1625 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634 /* Calculate "eclosure" for all the node in DFA. */
1636 static reg_errcode_t
1637 calc_eclosure (re_dfa_t *dfa)
1642 assert (dfa->nodes_len > 0);
1645 /* For each nodes, calculate epsilon closure. */
1646 for (node_idx = 0; ; ++node_idx)
1649 re_node_set eclosure_elem;
1650 if (node_idx == dfa->nodes_len)
1659 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1662 /* If we have already calculated, skip it. */
1663 if (dfa->eclosures[node_idx].nelem != 0)
1665 /* Calculate epsilon closure of `node_idx'. */
1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1667 if (BE (err != REG_NOERROR, 0))
1670 if (dfa->eclosures[node_idx].nelem == 0)
1673 re_node_set_free (&eclosure_elem);
1679 /* Calculate epsilon closure of NODE. */
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1686 re_node_set eclosure;
1688 bool incomplete = false;
1689 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690 if (BE (err != REG_NOERROR, 0))
1693 /* This indicates that we are calculating this node now.
1694 We reference this value to avoid infinite loop. */
1695 dfa->eclosures[node].nelem = REG_MISSING;
1697 /* If the current node has constraints, duplicate all nodes
1698 since they must inherit the constraints. */
1699 if (dfa->nodes[node].constraint
1700 && dfa->edests[node].nelem
1701 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703 err = duplicate_node_closure (dfa, node, node, node,
1704 dfa->nodes[node].constraint);
1705 if (BE (err != REG_NOERROR, 0))
1709 /* Expand each epsilon destination nodes. */
1710 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711 for (i = 0; i < dfa->edests[node].nelem; ++i)
1713 re_node_set eclosure_elem;
1714 Idx edest = dfa->edests[node].elems[i];
1715 /* If calculating the epsilon closure of `edest' is in progress,
1716 return intermediate result. */
1717 if (dfa->eclosures[edest].nelem == REG_MISSING)
1722 /* If we haven't calculated the epsilon closure of `edest' yet,
1723 calculate now. Otherwise use calculated epsilon closure. */
1724 if (dfa->eclosures[edest].nelem == 0)
1726 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1727 if (BE (err != REG_NOERROR, 0))
1731 eclosure_elem = dfa->eclosures[edest];
1732 /* Merge the epsilon closure of `edest'. */
1733 err = re_node_set_merge (&eclosure, &eclosure_elem);
1734 if (BE (err != REG_NOERROR, 0))
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1741 re_node_set_free (&eclosure_elem);
1745 /* An epsilon closure includes itself. */
1746 ok = re_node_set_insert (&eclosure, node);
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766 re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 /* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1778 if (re_string_eoi (input))
1780 token->type = END_OF_RE;
1784 c = re_string_peek_byte (input, 0);
1787 token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789 token->mb_partial = 0;
1790 if (input->mb_cur_max > 1 &&
1791 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 token->type = CHARACTER;
1794 token->mb_partial = 1;
1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803 token->type = BACK_SLASH;
1807 c2 = re_string_peek_byte_case (input, 1);
1809 token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811 if (input->mb_cur_max > 1)
1813 wint_t wc = re_string_wchar_at (input,
1814 re_string_cur_idx (input) + 1);
1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1819 token->word_char = IS_WORD_CHAR (c2) != 0;
1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825 token->type = OP_ALT;
1827 case '1': case '2': case '3': case '4': case '5':
1828 case '6': case '7': case '8': case '9':
1829 if (!(syntax & RE_NO_BK_REFS))
1831 token->type = OP_BACK_REF;
1832 token->opr.idx = c2 - '1';
1836 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = WORD_FIRST;
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_LAST;
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_DELIM;
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = NOT_WORD_DELIM;
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_WORD;
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTWORD;
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_SPACE;
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTSPACE;
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = BUF_FIRST;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_LAST;
1894 if (!(syntax & RE_NO_BK_PARENS))
1895 token->type = OP_OPEN_SUBEXP;
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_CLOSE_SUBEXP;
1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903 token->type = OP_DUP_PLUS;
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_QUESTION;
1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911 token->type = OP_OPEN_DUP_NUM;
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_CLOSE_DUP_NUM;
1923 token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925 if (input->mb_cur_max > 1)
1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1932 token->word_char = IS_WORD_CHAR (token->opr.c);
1937 if (syntax & RE_NEWLINE_ALT)
1938 token->type = OP_ALT;
1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942 token->type = OP_ALT;
1945 token->type = OP_DUP_ASTERISK;
1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_PLUS;
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_QUESTION;
1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957 token->type = OP_OPEN_DUP_NUM;
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_CLOSE_DUP_NUM;
1964 if (syntax & RE_NO_BK_PARENS)
1965 token->type = OP_OPEN_SUBEXP;
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_CLOSE_SUBEXP;
1972 token->type = OP_OPEN_BRACKET;
1975 token->type = OP_PERIOD;
1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979 re_string_cur_idx (input) != 0)
1981 char prev = re_string_peek_byte (input, -1);
1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985 token->type = ANCHOR;
1986 token->opr.ctx_type = LINE_FIRST;
1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990 re_string_cur_idx (input) + 1 != re_string_length (input))
1993 re_string_skip_bytes (input, 1);
1994 peek_token (&next, input, syntax);
1995 re_string_skip_bytes (input, -1);
1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999 token->type = ANCHOR;
2000 token->opr.ctx_type = LINE_LAST;
2008 /* Peek a token from INPUT, and return the length of the token.
2009 We must not use this function out of bracket expressions. */
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016 if (re_string_eoi (input))
2018 token->type = END_OF_RE;
2021 c = re_string_peek_byte (input, 0);
2024 #ifdef RE_ENABLE_I18N
2025 if (input->mb_cur_max > 1 &&
2026 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 token->type = CHARACTER;
2031 #endif /* RE_ENABLE_I18N */
2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034 && re_string_cur_idx (input) + 1 < re_string_length (input))
2036 /* In this case, '\' escape a character. */
2038 re_string_skip_bytes (input, 1);
2039 c2 = re_string_peek_byte (input, 0);
2041 token->type = CHARACTER;
2044 if (c == '[') /* '[' is a special char in a bracket exps. */
2048 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049 c2 = re_string_peek_byte (input, 1);
2057 token->type = OP_OPEN_COLL_ELEM;
2060 token->type = OP_OPEN_EQUIV_CLASS;
2063 if (syntax & RE_CHAR_CLASSES)
2065 token->type = OP_OPEN_CHAR_CLASS;
2068 /* else fall through. */
2070 token->type = CHARACTER;
2080 token->type = OP_CHARSET_RANGE;
2083 token->type = OP_CLOSE_BRACKET;
2086 token->type = OP_NON_MATCH_LIST;
2089 token->type = CHARACTER;
2094 /* Functions for parser. */
2096 /* Entry point of the parser.
2097 Parse the regular expression REGEXP and return the structure tree.
2098 If an error is occured, ERR is set by error code, and return NULL.
2099 This function build the following tree, from regular expression <reg_exp>:
2105 CAT means concatenation.
2106 EOR means end of regular expression. */
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 bin_tree_t *tree, *eor, *root;
2114 re_token_t current_token;
2115 dfa->syntax = syntax;
2116 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2122 root = create_tree (dfa, tree, eor, CONCAT);
2125 if (BE (eor == NULL || root == NULL, 0))
2133 /* This function build the following tree, from regular expression
2134 <branch1>|<branch2>:
2140 ALT means alternative, which represents the operator `|'. */
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147 bin_tree_t *tree, *branch = NULL;
2148 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152 while (token->type == OP_ALT)
2154 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155 if (token->type != OP_ALT && token->type != END_OF_RE
2156 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2164 tree = create_tree (dfa, tree, branch, OP_ALT);
2165 if (BE (tree == NULL, 0))
2174 /* This function build the following tree, from regular expression
2181 CAT means concatenation. */
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187 bin_tree_t *tree, *expr;
2188 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 while (token->type != OP_ALT && token->type != END_OF_RE
2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2201 if (tree != NULL && expr != NULL)
2203 tree = create_tree (dfa, tree, expr, CONCAT);
2210 else if (tree == NULL)
2212 /* Otherwise expr == NULL, we don't need to create new tree. */
2217 /* This function build the following tree, from regular expression a*:
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2229 switch (token->type)
2232 tree = create_token_tree (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2238 #ifdef RE_ENABLE_I18N
2239 if (dfa->mb_cur_max > 1)
2241 while (!re_string_eoi (regexp)
2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2244 bin_tree_t *mbc_remain;
2245 fetch_token (token, regexp, syntax);
2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 if (BE (mbc_remain == NULL || tree == NULL, 0))
2257 case OP_OPEN_SUBEXP:
2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 case OP_OPEN_BRACKET:
2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2273 dfa->used_bkref_map |= 1 << token->opr.idx;
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2281 dfa->has_mb_node = 1;
2283 case OP_OPEN_DUP_NUM:
2284 if (syntax & RE_CONTEXT_INVALID_DUP)
2290 case OP_DUP_ASTERISK:
2292 case OP_DUP_QUESTION:
2293 if (syntax & RE_CONTEXT_INVALID_OPS)
2298 else if (syntax & RE_CONTEXT_INDEP_OPS)
2300 fetch_token (token, regexp, syntax);
2301 return parse_expression (regexp, preg, token, syntax, nest, err);
2303 /* else fall through */
2304 case OP_CLOSE_SUBEXP:
2305 if ((token->type == OP_CLOSE_SUBEXP) &&
2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2311 /* else fall through */
2312 case OP_CLOSE_DUP_NUM:
2313 /* We treat it as a normal character. */
2315 /* Then we can these characters as normal characters. */
2316 token->type = CHARACTER;
2317 /* mb_partial and word_char bits should be initialized already
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2327 if ((token->opr.ctx_type
2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 && dfa->word_ops_used == 0)
2330 init_word_char (dfa);
2331 if (token->opr.ctx_type == WORD_DELIM
2332 || token->opr.ctx_type == NOT_WORD_DELIM)
2334 bin_tree_t *tree_first, *tree_last;
2335 if (token->opr.ctx_type == WORD_DELIM)
2337 token->opr.ctx_type = WORD_FIRST;
2338 tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 token->opr.ctx_type = WORD_LAST;
2343 token->opr.ctx_type = INSIDE_WORD;
2344 tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 token->opr.ctx_type = INSIDE_NOTWORD;
2347 tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357 tree = create_token_tree (dfa, NULL, NULL, token);
2358 if (BE (tree == NULL, 0))
2364 /* We must return here, since ANCHORs can't be followed
2365 by repetition operators.
2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2368 fetch_token (token, regexp, syntax);
2371 tree = create_token_tree (dfa, NULL, NULL, token);
2372 if (BE (tree == NULL, 0))
2377 if (dfa->mb_cur_max > 1)
2378 dfa->has_mb_node = 1;
2382 tree = build_charclass_op (dfa, regexp->trans,
2383 (const unsigned char *) "alnum",
2384 (const unsigned char *) "_",
2385 token->type == OP_NOTWORD, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2391 tree = build_charclass_op (dfa, regexp->trans,
2392 (const unsigned char *) "space",
2393 (const unsigned char *) "",
2394 token->type == OP_NOTSPACE, err);
2395 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2405 /* Must not happen? */
2411 fetch_token (token, regexp, syntax);
2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 && (token->type == OP_DUP_ASTERISK
2422 || token->type == OP_OPEN_DUP_NUM))
2432 /* This function build the following tree, from regular expression
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446 cur_nsub = preg->re_nsub++;
2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450 /* The subexpression may be a null string. */
2451 if (token->type == OP_CLOSE_SUBEXP)
2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2458 if (BE (*err != REG_NOERROR, 0))
2462 if (cur_nsub <= '9' - '1')
2463 dfa->completed_bkref_map |= 1 << cur_nsub;
2465 tree = create_tree (dfa, tree, NULL, SUBEXP);
2466 if (BE (tree == NULL, 0))
2471 tree->token.opr.idx = cur_nsub;
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481 bin_tree_t *tree = NULL, *old_tree = NULL;
2482 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2483 re_token_t start_token = *token;
2485 if (token->type == OP_OPEN_DUP_NUM)
2488 start = fetch_number (regexp, token, syntax);
2489 if (start == REG_MISSING)
2491 if (token->type == CHARACTER && token->opr.c == ',')
2492 start = 0; /* We treat "{,m}" as "{0,m}". */
2495 *err = REG_BADBR; /* <re>{} is invalid. */
2499 if (BE (start != REG_ERROR, 1))
2501 /* We treat "{n}" as "{n,n}". */
2502 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503 : ((token->type == CHARACTER && token->opr.c == ',')
2504 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2506 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2508 /* Invalid sequence. */
2509 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2511 if (token->type == END_OF_RE)
2519 /* If the syntax bit is set, rollback. */
2520 re_string_set_index (regexp, start_idx);
2521 *token = start_token;
2522 token->type = CHARACTER;
2523 /* mb_partial and word_char bits should be already initialized by
2528 if (BE ((end != REG_MISSING && start > end)
2529 || token->type != OP_CLOSE_DUP_NUM, 0))
2531 /* First number greater than second. */
2538 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2542 fetch_token (token, regexp, syntax);
2544 if (BE (elem == NULL, 0))
2546 if (BE (start == 0 && end == 0, 0))
2548 postorder (elem, free_tree, NULL);
2552 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2553 if (BE (start > 0, 0))
2556 for (i = 2; i <= start; ++i)
2558 elem = duplicate_tree (elem, dfa);
2559 tree = create_tree (dfa, tree, elem, CONCAT);
2560 if (BE (elem == NULL || tree == NULL, 0))
2561 goto parse_dup_op_espace;
2567 /* Duplicate ELEM before it is marked optional. */
2568 elem = duplicate_tree (elem, dfa);
2574 if (elem->token.type == SUBEXP)
2575 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2577 tree = create_tree (dfa, elem, NULL,
2578 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2579 if (BE (tree == NULL, 0))
2580 goto parse_dup_op_espace;
2582 /* From gnulib's "intprops.h":
2583 True if the arithmetic type T is signed. */
2584 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2586 /* This loop is actually executed only when end != REG_MISSING,
2587 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2588 already created the start+1-th copy. */
2589 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2590 for (i = start + 2; i <= end; ++i)
2592 elem = duplicate_tree (elem, dfa);
2593 tree = create_tree (dfa, tree, elem, CONCAT);
2594 if (BE (elem == NULL || tree == NULL, 0))
2595 goto parse_dup_op_espace;
2597 tree = create_tree (dfa, tree, NULL, OP_ALT);
2598 if (BE (tree == NULL, 0))
2599 goto parse_dup_op_espace;
2603 tree = create_tree (dfa, old_tree, tree, CONCAT);
2607 parse_dup_op_espace:
2612 /* Size of the names for collating symbol/equivalence_class/character_class.
2613 I'm not sure, but maybe enough. */
2614 #define BRACKET_NAME_BUF_SIZE 32
2617 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2618 Build the range expression which starts from START_ELEM, and ends
2619 at END_ELEM. The result are written to MBCSET and SBCSET.
2620 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2621 mbcset->range_ends, is a pointer argument sinse we may
2624 static reg_errcode_t
2626 # ifdef RE_ENABLE_I18N
2627 build_range_exp (const reg_syntax_t syntax,
2629 re_charset_t *mbcset,
2631 const bracket_elem_t *start_elem,
2632 const bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (const reg_syntax_t syntax,
2636 const bracket_elem_t *start_elem,
2637 const bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2640 unsigned int start_ch, end_ch;
2641 /* Equivalence Classes and Character Classes can't be a range start/end. */
2642 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2647 /* We can handle no multi character collating elements without libc
2649 if (BE ((start_elem->type == COLL_SYM
2650 && strlen ((char *) start_elem->opr.name) > 1)
2651 || (end_elem->type == COLL_SYM
2652 && strlen ((char *) end_elem->opr.name) > 1), 0))
2653 return REG_ECOLLATE;
2655 # ifdef RE_ENABLE_I18N
2660 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669 ? __btowc (start_ch) : start_elem->opr.wch);
2670 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671 ? __btowc (end_ch) : end_elem->opr.wch);
2672 if (start_wc == WEOF || end_wc == WEOF)
2673 return REG_ECOLLATE;
2674 cmp_buf[0] = start_wc;
2675 cmp_buf[4] = end_wc;
2677 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2678 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2681 /* Got valid collation sequence values, add them as a new entry.
2682 However, for !_LIBC we have no collation elements: if the
2683 character set is single byte, the single byte character set
2684 that we build below suffices. parse_bracket_exp passes
2685 no MBCSET if dfa->mb_cur_max == 1. */
2688 /* Check the space of the arrays. */
2689 if (BE (*range_alloc == mbcset->nranges, 0))
2691 /* There is not enough space, need realloc. */
2692 wchar_t *new_array_start, *new_array_end;
2695 /* +1 in case of mbcset->nranges is 0. */
2696 new_nranges = 2 * mbcset->nranges + 1;
2697 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2698 are NULL if *range_alloc == 0. */
2699 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2701 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2704 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2707 mbcset->range_starts = new_array_start;
2708 mbcset->range_ends = new_array_end;
2709 *range_alloc = new_nranges;
2712 mbcset->range_starts[mbcset->nranges] = start_wc;
2713 mbcset->range_ends[mbcset->nranges++] = end_wc;
2716 /* Build the table for single byte characters. */
2717 for (wc = 0; wc < SBC_MAX; ++wc)
2720 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2721 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2722 bitset_set (sbcset, wc);
2725 # else /* not RE_ENABLE_I18N */
2728 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2729 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2731 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2732 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2734 if (start_ch > end_ch)
2736 /* Build the table for single byte characters. */
2737 for (ch = 0; ch < SBC_MAX; ++ch)
2738 if (start_ch <= ch && ch <= end_ch)
2739 bitset_set (sbcset, ch);
2741 # endif /* not RE_ENABLE_I18N */
2744 #endif /* not _LIBC */
2747 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2748 Build the collating element which is represented by NAME.
2749 The result are written to MBCSET and SBCSET.
2750 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2751 pointer argument since we may update it. */
2753 static reg_errcode_t
2755 build_collating_symbol (bitset_t sbcset,
2756 # ifdef RE_ENABLE_I18N
2757 re_charset_t *mbcset, Idx *coll_sym_alloc,
2759 const unsigned char *name)
2761 size_t name_len = strlen ((const char *) name);
2762 if (BE (name_len != 1, 0))
2763 return REG_ECOLLATE;
2766 bitset_set (sbcset, name[0]);
2770 #endif /* not _LIBC */
2772 /* This function parse bracket expression like "[abc]", "[a-c]",
2776 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2777 reg_syntax_t syntax, reg_errcode_t *err)
2780 const unsigned char *collseqmb;
2781 const char *collseqwc;
2784 const int32_t *symb_table;
2785 const unsigned char *extra;
2787 /* Local function for parse_bracket_exp used in _LIBC environement.
2788 Seek the collating symbol entry correspondings to NAME.
2789 Return the index of the symbol in the SYMB_TABLE. */
2792 __attribute ((always_inline))
2793 seek_collating_symbol_entry (name, name_len)
2794 const unsigned char *name;
2797 int32_t hash = elem_hash ((const char *) name, name_len);
2798 int32_t elem = hash % table_size;
2799 if (symb_table[2 * elem] != 0)
2801 int32_t second = hash % (table_size - 2) + 1;
2805 /* First compare the hashing value. */
2806 if (symb_table[2 * elem] == hash
2807 /* Compare the length of the name. */
2808 && name_len == extra[symb_table[2 * elem + 1]]
2809 /* Compare the name. */
2810 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2813 /* Yep, this is the entry. */
2820 while (symb_table[2 * elem] != 0);
2825 /* Local function for parse_bracket_exp used in _LIBC environment.
2826 Look up the collation sequence value of BR_ELEM.
2827 Return the value if succeeded, UINT_MAX otherwise. */
2829 auto inline unsigned int
2830 __attribute ((always_inline))
2831 lookup_collation_sequence_value (br_elem)
2832 bracket_elem_t *br_elem;
2834 if (br_elem->type == SB_CHAR)
2837 if (MB_CUR_MAX == 1)
2840 return collseqmb[br_elem->opr.ch];
2843 wint_t wc = __btowc (br_elem->opr.ch);
2844 return __collseq_table_lookup (collseqwc, wc);
2847 else if (br_elem->type == MB_CHAR)
2850 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2852 else if (br_elem->type == COLL_SYM)
2854 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2858 elem = seek_collating_symbol_entry (br_elem->opr.name,
2860 if (symb_table[2 * elem] != 0)
2862 /* We found the entry. */
2863 idx = symb_table[2 * elem + 1];
2864 /* Skip the name of collating element name. */
2865 idx += 1 + extra[idx];
2866 /* Skip the byte sequence of the collating element. */
2867 idx += 1 + extra[idx];
2868 /* Adjust for the alignment. */
2869 idx = (idx + 3) & ~3;
2870 /* Skip the multibyte collation sequence value. */
2871 idx += sizeof (unsigned int);
2872 /* Skip the wide char sequence of the collating element. */
2873 idx += sizeof (unsigned int) *
2874 (1 + *(unsigned int *) (extra + idx));
2875 /* Return the collation sequence value. */
2876 return *(unsigned int *) (extra + idx);
2878 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2880 /* No valid character. Match it as a single byte
2882 return collseqmb[br_elem->opr.name[0]];
2885 else if (sym_name_len == 1)
2886 return collseqmb[br_elem->opr.name[0]];
2891 /* Local function for parse_bracket_exp used in _LIBC environement.
2892 Build the range expression which starts from START_ELEM, and ends
2893 at END_ELEM. The result are written to MBCSET and SBCSET.
2894 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2895 mbcset->range_ends, is a pointer argument sinse we may
2898 auto inline reg_errcode_t
2899 __attribute ((always_inline))
2900 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2901 re_charset_t *mbcset;
2904 bracket_elem_t *start_elem, *end_elem;
2907 uint32_t start_collseq;
2908 uint32_t end_collseq;
2910 /* Equivalence Classes and Character Classes can't be a range
2912 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2913 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2917 start_collseq = lookup_collation_sequence_value (start_elem);
2918 end_collseq = lookup_collation_sequence_value (end_elem);
2919 /* Check start/end collation sequence values. */
2920 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2921 return REG_ECOLLATE;
2922 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2925 /* Got valid collation sequence values, add them as a new entry.
2926 However, if we have no collation elements, and the character set
2927 is single byte, the single byte character set that we
2928 build below suffices. */
2929 if (nrules > 0 || dfa->mb_cur_max > 1)
2931 /* Check the space of the arrays. */
2932 if (BE (*range_alloc == mbcset->nranges, 0))
2934 /* There is not enough space, need realloc. */
2935 uint32_t *new_array_start;
2936 uint32_t *new_array_end;
2939 /* +1 in case of mbcset->nranges is 0. */
2940 new_nranges = 2 * mbcset->nranges + 1;
2941 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2943 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2946 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2949 mbcset->range_starts = new_array_start;
2950 mbcset->range_ends = new_array_end;
2951 *range_alloc = new_nranges;
2954 mbcset->range_starts[mbcset->nranges] = start_collseq;
2955 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2958 /* Build the table for single byte characters. */
2959 for (ch = 0; ch < SBC_MAX; ch++)
2961 uint32_t ch_collseq;
2963 if (MB_CUR_MAX == 1)
2966 ch_collseq = collseqmb[ch];
2968 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2969 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2970 bitset_set (sbcset, ch);
2975 /* Local function for parse_bracket_exp used in _LIBC environement.
2976 Build the collating element which is represented by NAME.
2977 The result are written to MBCSET and SBCSET.
2978 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2979 pointer argument sinse we may update it. */
2981 auto inline reg_errcode_t
2982 __attribute ((always_inline))
2983 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2984 re_charset_t *mbcset;
2985 Idx *coll_sym_alloc;
2987 const unsigned char *name;
2990 size_t name_len = strlen ((const char *) name);
2993 elem = seek_collating_symbol_entry (name, name_len);
2994 if (symb_table[2 * elem] != 0)
2996 /* We found the entry. */
2997 idx = symb_table[2 * elem + 1];
2998 /* Skip the name of collating element name. */
2999 idx += 1 + extra[idx];
3001 else if (symb_table[2 * elem] == 0 && name_len == 1)
3003 /* No valid character, treat it as a normal
3005 bitset_set (sbcset, name[0]);
3009 return REG_ECOLLATE;
3011 /* Got valid collation sequence, add it as a new entry. */
3012 /* Check the space of the arrays. */
3013 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3015 /* Not enough, realloc it. */
3016 /* +1 in case of mbcset->ncoll_syms is 0. */
3017 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3018 /* Use realloc since mbcset->coll_syms is NULL
3020 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3021 new_coll_sym_alloc);
3022 if (BE (new_coll_syms == NULL, 0))
3024 mbcset->coll_syms = new_coll_syms;
3025 *coll_sym_alloc = new_coll_sym_alloc;
3027 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3032 if (BE (name_len != 1, 0))
3033 return REG_ECOLLATE;
3036 bitset_set (sbcset, name[0]);
3043 re_token_t br_token;
3044 re_bitset_ptr_t sbcset;
3045 #ifdef RE_ENABLE_I18N
3046 re_charset_t *mbcset;
3047 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3048 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3049 #endif /* not RE_ENABLE_I18N */
3050 bool non_match = false;
3051 bin_tree_t *work_tree;
3053 bool first_round = true;
3055 collseqmb = (const unsigned char *)
3056 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3057 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3063 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3064 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3065 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3066 _NL_COLLATE_SYMB_TABLEMB);
3067 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3068 _NL_COLLATE_SYMB_EXTRAMB);
3071 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3072 #ifdef RE_ENABLE_I18N
3073 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3074 #endif /* RE_ENABLE_I18N */
3075 #ifdef RE_ENABLE_I18N
3076 if (BE (sbcset == NULL || mbcset == NULL, 0))
3078 if (BE (sbcset == NULL, 0))
3079 #endif /* RE_ENABLE_I18N */
3085 token_len = peek_token_bracket (token, regexp, syntax);
3086 if (BE (token->type == END_OF_RE, 0))
3089 goto parse_bracket_exp_free_return;
3091 if (token->type == OP_NON_MATCH_LIST)
3093 #ifdef RE_ENABLE_I18N
3094 mbcset->non_match = 1;
3095 #endif /* not RE_ENABLE_I18N */
3097 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3098 bitset_set (sbcset, '\n');
3099 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3100 token_len = peek_token_bracket (token, regexp, syntax);
3101 if (BE (token->type == END_OF_RE, 0))
3104 goto parse_bracket_exp_free_return;
3108 /* We treat the first ']' as a normal character. */
3109 if (token->type == OP_CLOSE_BRACKET)
3110 token->type = CHARACTER;
3114 bracket_elem_t start_elem, end_elem;
3115 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3116 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3119 bool is_range_exp = false;
3122 start_elem.opr.name = start_name_buf;
3123 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3124 syntax, first_round);
3125 if (BE (ret != REG_NOERROR, 0))
3128 goto parse_bracket_exp_free_return;
3130 first_round = false;
3132 /* Get information about the next token. We need it in any case. */
3133 token_len = peek_token_bracket (token, regexp, syntax);
3135 /* Do not check for ranges if we know they are not allowed. */
3136 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3138 if (BE (token->type == END_OF_RE, 0))
3141 goto parse_bracket_exp_free_return;
3143 if (token->type == OP_CHARSET_RANGE)
3145 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3146 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3147 if (BE (token2.type == END_OF_RE, 0))
3150 goto parse_bracket_exp_free_return;
3152 if (token2.type == OP_CLOSE_BRACKET)
3154 /* We treat the last '-' as a normal character. */
3155 re_string_skip_bytes (regexp, -token_len);
3156 token->type = CHARACTER;
3159 is_range_exp = true;
3163 if (is_range_exp == true)
3165 end_elem.opr.name = end_name_buf;
3166 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3168 if (BE (ret != REG_NOERROR, 0))
3171 goto parse_bracket_exp_free_return;
3174 token_len = peek_token_bracket (token, regexp, syntax);
3177 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3178 &start_elem, &end_elem);
3180 # ifdef RE_ENABLE_I18N
3181 *err = build_range_exp (syntax, sbcset,
3182 dfa->mb_cur_max > 1 ? mbcset : NULL,
3183 &range_alloc, &start_elem, &end_elem);
3185 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3187 #endif /* RE_ENABLE_I18N */
3188 if (BE (*err != REG_NOERROR, 0))
3189 goto parse_bracket_exp_free_return;
3193 switch (start_elem.type)
3196 bitset_set (sbcset, start_elem.opr.ch);
3198 #ifdef RE_ENABLE_I18N
3200 /* Check whether the array has enough space. */
3201 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3203 wchar_t *new_mbchars;
3204 /* Not enough, realloc it. */
3205 /* +1 in case of mbcset->nmbchars is 0. */
3206 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3207 /* Use realloc since array is NULL if *alloc == 0. */
3208 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3210 if (BE (new_mbchars == NULL, 0))
3211 goto parse_bracket_exp_espace;
3212 mbcset->mbchars = new_mbchars;
3214 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3216 #endif /* RE_ENABLE_I18N */
3218 *err = build_equiv_class (sbcset,
3219 #ifdef RE_ENABLE_I18N
3220 mbcset, &equiv_class_alloc,
3221 #endif /* RE_ENABLE_I18N */
3222 start_elem.opr.name);
3223 if (BE (*err != REG_NOERROR, 0))
3224 goto parse_bracket_exp_free_return;
3227 *err = build_collating_symbol (sbcset,
3228 #ifdef RE_ENABLE_I18N
3229 mbcset, &coll_sym_alloc,
3230 #endif /* RE_ENABLE_I18N */
3231 start_elem.opr.name);
3232 if (BE (*err != REG_NOERROR, 0))
3233 goto parse_bracket_exp_free_return;
3236 *err = build_charclass (regexp->trans, sbcset,
3237 #ifdef RE_ENABLE_I18N
3238 mbcset, &char_class_alloc,
3239 #endif /* RE_ENABLE_I18N */
3240 start_elem.opr.name, syntax);
3241 if (BE (*err != REG_NOERROR, 0))
3242 goto parse_bracket_exp_free_return;
3249 if (BE (token->type == END_OF_RE, 0))
3252 goto parse_bracket_exp_free_return;
3254 if (token->type == OP_CLOSE_BRACKET)
3258 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3260 /* If it is non-matching list. */
3262 bitset_not (sbcset);
3264 #ifdef RE_ENABLE_I18N
3265 /* Ensure only single byte characters are set. */
3266 if (dfa->mb_cur_max > 1)
3267 bitset_mask (sbcset, dfa->sb_char);
3269 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3270 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3271 || mbcset->non_match)))
3273 bin_tree_t *mbc_tree;
3275 /* Build a tree for complex bracket. */
3276 dfa->has_mb_node = 1;
3277 br_token.type = COMPLEX_BRACKET;
3278 br_token.opr.mbcset = mbcset;
3279 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3280 if (BE (mbc_tree == NULL, 0))
3281 goto parse_bracket_exp_espace;
3282 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3283 if (sbcset[sbc_idx])
3285 /* If there are no bits set in sbcset, there is no point
3286 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3287 if (sbc_idx < BITSET_WORDS)
3289 /* Build a tree for simple bracket. */
3290 br_token.type = SIMPLE_BRACKET;
3291 br_token.opr.sbcset = sbcset;
3292 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3293 if (BE (work_tree == NULL, 0))
3294 goto parse_bracket_exp_espace;
3296 /* Then join them by ALT node. */
3297 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3298 if (BE (work_tree == NULL, 0))
3299 goto parse_bracket_exp_espace;
3304 work_tree = mbc_tree;
3308 #endif /* not RE_ENABLE_I18N */
3310 #ifdef RE_ENABLE_I18N
3311 free_charset (mbcset);
3313 /* Build a tree for simple bracket. */
3314 br_token.type = SIMPLE_BRACKET;
3315 br_token.opr.sbcset = sbcset;
3316 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3317 if (BE (work_tree == NULL, 0))
3318 goto parse_bracket_exp_espace;
3322 parse_bracket_exp_espace:
3324 parse_bracket_exp_free_return:
3326 #ifdef RE_ENABLE_I18N
3327 free_charset (mbcset);
3328 #endif /* RE_ENABLE_I18N */
3332 /* Parse an element in the bracket expression. */
3334 static reg_errcode_t
3335 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3336 re_token_t *token, int token_len, re_dfa_t *dfa,
3337 reg_syntax_t syntax, bool accept_hyphen)
3339 #ifdef RE_ENABLE_I18N
3341 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3342 if (cur_char_size > 1)
3344 elem->type = MB_CHAR;
3345 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3346 re_string_skip_bytes (regexp, cur_char_size);
3349 #endif /* RE_ENABLE_I18N */
3350 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3351 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3352 || token->type == OP_OPEN_EQUIV_CLASS)
3353 return parse_bracket_symbol (elem, regexp, token);
3354 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3356 /* A '-' must only appear as anything but a range indicator before
3357 the closing bracket. Everything else is an error. */
3359 (void) peek_token_bracket (&token2, regexp, syntax);
3360 if (token2.type != OP_CLOSE_BRACKET)
3361 /* The actual error value is not standardized since this whole
3362 case is undefined. But ERANGE makes good sense. */
3365 elem->type = SB_CHAR;
3366 elem->opr.ch = token->opr.c;
3370 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3371 such as [:<character_class>:], [.<collating_element>.], and
3372 [=<equivalent_class>=]. */
3374 static reg_errcode_t
3375 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3378 unsigned char ch, delim = token->opr.c;
3380 if (re_string_eoi(regexp))
3384 if (i >= BRACKET_NAME_BUF_SIZE)
3386 if (token->type == OP_OPEN_CHAR_CLASS)
3387 ch = re_string_fetch_byte_case (regexp);
3389 ch = re_string_fetch_byte (regexp);
3390 if (re_string_eoi(regexp))
3392 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3394 elem->opr.name[i] = ch;
3396 re_string_skip_bytes (regexp, 1);
3397 elem->opr.name[i] = '\0';
3398 switch (token->type)
3400 case OP_OPEN_COLL_ELEM:
3401 elem->type = COLL_SYM;
3403 case OP_OPEN_EQUIV_CLASS:
3404 elem->type = EQUIV_CLASS;
3406 case OP_OPEN_CHAR_CLASS:
3407 elem->type = CHAR_CLASS;
3415 /* Helper function for parse_bracket_exp.
3416 Build the equivalence class which is represented by NAME.
3417 The result are written to MBCSET and SBCSET.
3418 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3419 is a pointer argument sinse we may update it. */
3421 static reg_errcode_t
3422 #ifdef RE_ENABLE_I18N
3423 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3424 Idx *equiv_class_alloc, const unsigned char *name)
3425 #else /* not RE_ENABLE_I18N */
3426 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3427 #endif /* not RE_ENABLE_I18N */
3430 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3433 const int32_t *table, *indirect;
3434 const unsigned char *weights, *extra, *cp;
3435 unsigned char char_buf[2];
3439 /* This #include defines a local function! */
3440 # include <locale/weight.h>
3441 /* Calculate the index for equivalence class. */
3443 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3444 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3445 _NL_COLLATE_WEIGHTMB);
3446 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3447 _NL_COLLATE_EXTRAMB);
3448 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3449 _NL_COLLATE_INDIRECTMB);
3450 idx1 = findidx (&cp);
3451 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3452 /* This isn't a valid character. */
3453 return REG_ECOLLATE;
3455 /* Build single byte matcing table for this equivalence class. */
3456 char_buf[1] = (unsigned char) '\0';
3457 len = weights[idx1 & 0xffffff];
3458 for (ch = 0; ch < SBC_MAX; ++ch)
3462 idx2 = findidx (&cp);
3467 /* This isn't a valid character. */
3469 /* Compare only if the length matches and the collation rule
3470 index is the same. */
3471 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3475 while (cnt <= len &&
3476 weights[(idx1 & 0xffffff) + 1 + cnt]
3477 == weights[(idx2 & 0xffffff) + 1 + cnt])
3481 bitset_set (sbcset, ch);
3484 /* Check whether the array has enough space. */
3485 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3487 /* Not enough, realloc it. */
3488 /* +1 in case of mbcset->nequiv_classes is 0. */
3489 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3490 /* Use realloc since the array is NULL if *alloc == 0. */
3491 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3493 new_equiv_class_alloc);
3494 if (BE (new_equiv_classes == NULL, 0))
3496 mbcset->equiv_classes = new_equiv_classes;
3497 *equiv_class_alloc = new_equiv_class_alloc;
3499 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3504 if (BE (strlen ((const char *) name) != 1, 0))
3505 return REG_ECOLLATE;
3506 bitset_set (sbcset, *name);
3511 /* Helper function for parse_bracket_exp.
3512 Build the character class which is represented by NAME.
3513 The result are written to MBCSET and SBCSET.
3514 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3515 is a pointer argument sinse we may update it. */
3517 static reg_errcode_t
3518 #ifdef RE_ENABLE_I18N
3519 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3520 re_charset_t *mbcset, Idx *char_class_alloc,
3521 const unsigned char *class_name, reg_syntax_t syntax)
3522 #else /* not RE_ENABLE_I18N */
3523 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3524 const unsigned char *class_name, reg_syntax_t syntax)
3525 #endif /* not RE_ENABLE_I18N */
3528 const char *name = (const char *) class_name;
3530 /* In case of REG_ICASE "upper" and "lower" match the both of
3531 upper and lower cases. */
3532 if ((syntax & RE_ICASE)
3533 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3536 #ifdef RE_ENABLE_I18N
3537 /* Check the space of the arrays. */
3538 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3540 /* Not enough, realloc it. */
3541 /* +1 in case of mbcset->nchar_classes is 0. */
3542 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3543 /* Use realloc since array is NULL if *alloc == 0. */
3544 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3545 new_char_class_alloc);
3546 if (BE (new_char_classes == NULL, 0))
3548 mbcset->char_classes = new_char_classes;
3549 *char_class_alloc = new_char_class_alloc;
3551 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3552 #endif /* RE_ENABLE_I18N */
3554 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3556 if (BE (trans != NULL, 0)) \
3558 for (i = 0; i < SBC_MAX; ++i) \
3559 if (ctype_func (i)) \
3560 bitset_set (sbcset, trans[i]); \
3564 for (i = 0; i < SBC_MAX; ++i) \
3565 if (ctype_func (i)) \
3566 bitset_set (sbcset, i); \
3570 if (strcmp (name, "alnum") == 0)
3571 BUILD_CHARCLASS_LOOP (isalnum);
3572 else if (strcmp (name, "cntrl") == 0)
3573 BUILD_CHARCLASS_LOOP (iscntrl);
3574 else if (strcmp (name, "lower") == 0)
3575 BUILD_CHARCLASS_LOOP (islower);
3576 else if (strcmp (name, "space") == 0)
3577 BUILD_CHARCLASS_LOOP (isspace);
3578 else if (strcmp (name, "alpha") == 0)
3579 BUILD_CHARCLASS_LOOP (isalpha);
3580 else if (strcmp (name, "digit") == 0)
3581 BUILD_CHARCLASS_LOOP (isdigit);
3582 else if (strcmp (name, "print") == 0)
3583 BUILD_CHARCLASS_LOOP (isprint);
3584 else if (strcmp (name, "upper") == 0)
3585 BUILD_CHARCLASS_LOOP (isupper);
3586 else if (strcmp (name, "blank") == 0)
3587 BUILD_CHARCLASS_LOOP (isblank);
3588 else if (strcmp (name, "graph") == 0)
3589 BUILD_CHARCLASS_LOOP (isgraph);
3590 else if (strcmp (name, "punct") == 0)
3591 BUILD_CHARCLASS_LOOP (ispunct);
3592 else if (strcmp (name, "xdigit") == 0)
3593 BUILD_CHARCLASS_LOOP (isxdigit);
3601 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3602 const unsigned char *class_name,
3603 const unsigned char *extra, bool non_match,
3606 re_bitset_ptr_t sbcset;
3607 #ifdef RE_ENABLE_I18N
3608 re_charset_t *mbcset;
3610 #endif /* not RE_ENABLE_I18N */
3612 re_token_t br_token;
3615 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3616 #ifdef RE_ENABLE_I18N
3617 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3618 #endif /* RE_ENABLE_I18N */
3620 #ifdef RE_ENABLE_I18N
3621 if (BE (sbcset == NULL || mbcset == NULL, 0))
3622 #else /* not RE_ENABLE_I18N */
3623 if (BE (sbcset == NULL, 0))
3624 #endif /* not RE_ENABLE_I18N */
3632 #ifdef RE_ENABLE_I18N
3633 mbcset->non_match = 1;
3634 #endif /* not RE_ENABLE_I18N */
3637 /* We don't care the syntax in this case. */
3638 ret = build_charclass (trans, sbcset,
3639 #ifdef RE_ENABLE_I18N
3641 #endif /* RE_ENABLE_I18N */
3644 if (BE (ret != REG_NOERROR, 0))
3647 #ifdef RE_ENABLE_I18N
3648 free_charset (mbcset);
3649 #endif /* RE_ENABLE_I18N */
3653 /* \w match '_' also. */
3654 for (; *extra; extra++)
3655 bitset_set (sbcset, *extra);
3657 /* If it is non-matching list. */
3659 bitset_not (sbcset);
3661 #ifdef RE_ENABLE_I18N
3662 /* Ensure only single byte characters are set. */
3663 if (dfa->mb_cur_max > 1)
3664 bitset_mask (sbcset, dfa->sb_char);
3667 /* Build a tree for simple bracket. */
3668 br_token.type = SIMPLE_BRACKET;
3669 br_token.opr.sbcset = sbcset;
3670 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3671 if (BE (tree == NULL, 0))
3672 goto build_word_op_espace;
3674 #ifdef RE_ENABLE_I18N
3675 if (dfa->mb_cur_max > 1)
3677 bin_tree_t *mbc_tree;
3678 /* Build a tree for complex bracket. */
3679 br_token.type = COMPLEX_BRACKET;
3680 br_token.opr.mbcset = mbcset;
3681 dfa->has_mb_node = 1;
3682 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3683 if (BE (mbc_tree == NULL, 0))
3684 goto build_word_op_espace;
3685 /* Then join them by ALT node. */
3686 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3687 if (BE (mbc_tree != NULL, 1))
3692 free_charset (mbcset);
3695 #else /* not RE_ENABLE_I18N */
3697 #endif /* not RE_ENABLE_I18N */
3699 build_word_op_espace:
3701 #ifdef RE_ENABLE_I18N
3702 free_charset (mbcset);
3703 #endif /* RE_ENABLE_I18N */
3708 /* This is intended for the expressions like "a{1,3}".
3709 Fetch a number from `input', and return the number.
3710 Return REG_MISSING if the number field is empty like "{,1}".
3711 Return REG_ERROR if an error occurred. */
3714 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3716 Idx num = REG_MISSING;
3720 fetch_token (token, input, syntax);
3722 if (BE (token->type == END_OF_RE, 0))
3724 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3726 num = ((token->type != CHARACTER || c < '0' || '9' < c
3727 || num == REG_ERROR)
3729 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3730 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3735 #ifdef RE_ENABLE_I18N
3737 free_charset (re_charset_t *cset)
3739 re_free (cset->mbchars);
3741 re_free (cset->coll_syms);
3742 re_free (cset->equiv_classes);
3743 re_free (cset->range_starts);
3744 re_free (cset->range_ends);
3746 re_free (cset->char_classes);
3749 #endif /* RE_ENABLE_I18N */
3751 /* Functions for binary tree operation. */
3753 /* Create a tree node. */
3756 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3757 re_token_type_t type)
3761 return create_token_tree (dfa, left, right, &t);
3765 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3766 const re_token_t *token)
3769 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3771 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3773 if (storage == NULL)
3775 storage->next = dfa->str_tree_storage;
3776 dfa->str_tree_storage = storage;
3777 dfa->str_tree_storage_idx = 0;
3779 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3781 tree->parent = NULL;
3783 tree->right = right;
3784 tree->token = *token;
3785 tree->token.duplicated = 0;
3786 tree->token.opt_subexp = 0;
3789 tree->node_idx = REG_MISSING;
3792 left->parent = tree;
3794 right->parent = tree;
3798 /* Mark the tree SRC as an optional subexpression.
3799 To be called from preorder or postorder. */
3801 static reg_errcode_t
3802 mark_opt_subexp (void *extra, bin_tree_t *node)
3804 Idx idx = (Idx) (long) extra;
3805 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3806 node->token.opt_subexp = 1;
3811 /* Free the allocated memory inside NODE. */
3814 free_token (re_token_t *node)
3816 #ifdef RE_ENABLE_I18N
3817 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3818 free_charset (node->opr.mbcset);
3820 #endif /* RE_ENABLE_I18N */
3821 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3822 re_free (node->opr.sbcset);
3825 /* Worker function for tree walking. Free the allocated memory inside NODE
3826 and its children. */
3828 static reg_errcode_t
3829 free_tree (void *extra, bin_tree_t *node)
3831 free_token (&node->token);
3836 /* Duplicate the node SRC, and return new node. This is a preorder
3837 visit similar to the one implemented by the generic visitor, but
3838 we need more infrastructure to maintain two parallel trees --- so,
3839 it's easier to duplicate. */
3842 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3844 const bin_tree_t *node;
3845 bin_tree_t *dup_root;
3846 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3848 for (node = root; ; )
3850 /* Create a new tree and link it back to the current parent. */
3851 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3854 (*p_new)->parent = dup_node;
3855 (*p_new)->token.duplicated = 1;
3858 /* Go to the left node, or up and to the right. */
3862 p_new = &dup_node->left;
3866 const bin_tree_t *prev = NULL;
3867 while (node->right == prev || node->right == NULL)
3870 node = node->parent;
3871 dup_node = dup_node->parent;
3876 p_new = &dup_node->right;