1 /**************************************************************************
3 * Copyright 2013-2014 RAD Game Tools and Valve Software
4 * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 **************************************************************************/
27 // See http://pubs.opengroup.org/onlinepubs/7908799/xbd/re.html
28 // Or http://en.wikipedia.org/wiki/Regular_expression#POSIX_extended
29 // Or http://www.gnu.org/savannah-checkouts/gnu/libc/manual/html_node/Regular-Expressions.html#Regular-Expressions
32 From Wikipedia, because I'm lazy:
34 Most formalisms provide the following operations to construct regular expressions.
37 A vertical bar separates alternatives. For example, gray|grey can match "gray" or "grey".
40 Parentheses are used to define the scope and precedence of the
41 operators (among other uses). For example, gray|grey and gr(a|e)y are equivalent
42 patterns which both describe the set of "gray" or "grey".
45 A quantifier after a token (such as a character) or group specifies how often
46 that preceding element is allowed to occur. The most common quantifiers are the
47 question mark ?, the asterisk * (derived from the Kleene star), and the plus
48 sign + (Kleene cross).
50 ? The question mark indicates there is zero or one of the preceding element. For
51 example, colou?r matches both "color" and "colour".
53 * The asterisk indicates there is zero or more of the preceding element. For
54 example, ab*c matches "ac", "abc", "abbc", "abbbc", and so on.
56 + The plus sign indicates there is one or more of the preceding element. For
57 example, ab+c matches "abc", "abbc", "abbbc", and so on, but not "ac".
59 These constructions can be combined to form arbitrarily complex expressions,
60 much like one can construct arithmetical expressions from numbers and the
61 operations +, −, ×, and ÷. For example, H(ae?|ä)ndel and H(a|ae|ä)ndel are
62 both valid patterns which match the same strings as the earlier example,
67 . Matches any single character (many applications exclude newlines, and exactly
68 which characters are considered newlines is flavor-, character-encoding-, and
69 platform-specific, but it is safe to assume that the line feed character is
70 included). Within POSIX bracket expressions, the dot character matches a literal
71 dot. For example, a.c matches "abc", etc., but [a.c] matches only "a", ".", or
74 [ ] A bracket expression. Matches a single character that is contained within
75 the brackets. For example, [abc] matches "a", "b", or "c". [a-z] specifies a
76 range which matches any lowercase letter from "a" to "z". These forms can be
77 mixed: [abcx-z] matches "a", "b", "c", "x", "y", or "z", as does [a-cx-z]. The -
78 character is treated as a literal character if it is the last or the first
79 (after the ^) character within the brackets: [abc-], [-abc]. Note that backslash
80 escapes are not allowed. The ] character can be included in a bracket expression
81 if it is the first (after the ^) character: []abc].
83 [^ ] Matches a single character that is not contained within the brackets. For
84 example, [^abc] matches any character other than "a", "b", or "c". [^a-z]
85 matches any single character that is not a lowercase letter from "a" to "z".
86 Likewise, literal characters and ranges can be mixed.
88 ^ Matches the starting position within the string. In line-based tools, it
89 matches the starting position of any line.
91 $ Matches the ending position of the string or the position just before a
92 string-ending newline. In line-based tools, it matches the ending position of
95 ( ) Defines a marked subexpression. The string matched within the parentheses
96 can be recalled later (see the next entry, \n). A marked subexpression is also
97 called a block or capturing group. BRE mode requires \( \).
99 \n Matches what the nth marked subexpression matched, where n is a digit from 1
100 to 9. This construct is vaguely defined in the POSIX.2 standard. Some tools
101 allow referencing more than nine capturing groups.
103 * Matches the preceding element zero or more times. For example, ab*c matches
104 "ac", "abc", "abbbc", etc. [xyz]* matches "", "x", "y", "z", "zx", "zyx",
105 "xyzzy", and so on. (ab)* matches "", "ab", "abab", "ababab", and so on.
107 {m,n} Matches the preceding element at least m and not more than n times. For
108 example, a{3,5} matches only "aaa", "aaaa", and "aaaaa". This is not found in a
109 few older instances of regular expressions. BRE mode requires \{m,n\}.
112 .at matches any three-character string ending with "at", including "hat", "cat", and "bat".
113 [hc]at matches "hat" and "cat".
114 [^b]at matches all strings matched by .at except "bat".
115 [^hc]at matches all strings matched by .at other than "hat" and "cat".
116 ^[hc]at matches "hat" and "cat", but only at the beginning of the string or line.
117 [hc]at$ matches "hat" and "cat", but only at the end of the string or line.
118 \[.\] matches any single character surrounded by "[" and "]" since the brackets are escaped, for example: "[a]" and "[b]".
121 The meaning of metacharacters escaped with a backslash is reversed for some
122 characters in the POSIX Extended Regular Expression (ERE) syntax. With this
123 syntax, a backslash causes the metacharacter to be treated as a literal
124 character. So, for example, \( \) is now ( ) and \{ \} is now { }. Additionally,
125 support is removed for \n backreferences and the following metacharacters are
128 ? Matches the preceding element zero or one time. For example, ab?c matches only
131 + Matches the preceding element one or more times. For example, ab+c matches
132 "abc", "abbc", "abbbc", and so on, but not "ac".
134 | The choice (also known as alternation or set union) operator matches either
135 the expression before or the expression after the operator. For example, abc|def
136 matches "abc" or "def".
139 [hc]+at matches "hat", "cat", "hhat", "chat", "hcat", "cchchat", and so on, but not "at".
140 [hc]?at matches "hat", "cat", and "at".
141 [hc]*at matches "hat", "cat", "hhat", "chat", "hcat", "cchchat", "at", and so on.
142 cat|dog matches "cat" or "dog".
144 A bound is `{' followed by an unsigned decimal integer, possibly fol-
145 lowed by `,' possibly followed by another unsigned decimal integer,
146 always followed by `}'. The integers must lie between 0 and RE_DUP_MAX
147 (255) inclusive, and if there are two of them, the first may not
148 exceed the second. An atom followed by a bound containing one integer
149 i and no comma matches a sequence of exactly i matches of the atom. An
150 atom followed by a bound containing one integer i and a comma matches a
151 sequence of i or more matches of the atom. An atom followed by a bound
152 containing two integers i and j matches a sequence of i through j
153 (inclusive) matches of the atom.
157 #define _REGEX_H_ /* never again */
158 /* ========= begin header generated by ./mkh ========= */
163 /* === regex2.h === */
164 // Don't depend on off_t: it can change depending on which macros were defined before the system headers are included.
165 //typedef off_t regoff_t;
166 typedef long long regoff_t;
170 size_t re_nsub; /* number of parenthesized subexpressions */
171 const char *re_endp; /* end pointer for REG_PEND */
172 struct re_guts *re_g; /* none of your business :-) */
177 regoff_t rm_so; /* start of match */
178 regoff_t rm_eo; /* end of match */
181 /* === regcomp.c === */
/*
 * vogl_regcomp: declared with unnamed parameters (regex_t *, const char *, int),
 * returning int.
 * NOTE(review): presumably mirrors POSIX regcomp(preg, pattern, cflags), with the
 * REG_* values listed directly below OR'ed together to form the int argument --
 * confirm against regcomp.c.
 */
182 extern int vogl_regcomp(regex_t *, const char *, int);
/* Octal constants: REG_BASIC is zero (no bits set); each other value is a single, distinct bit. */
183 #define REG_BASIC 0000
184 #define REG_EXTENDED 0001
185 #define REG_ICASE 0002
186 #define REG_NOSUB 0004
187 #define REG_NEWLINE 0010
188 #define REG_NOSPEC 0020
189 #define REG_PEND 0040
190 #define REG_DUMP 0200
192 /* === regerror.c === */
/*
 * Numeric codes belonging to the regerror.c section.
 * NOTE(review): presumably the error values produced by vogl_regcomp() /
 * vogl_regexec() and accepted by vogl_regerror() -- confirm against regerror.c.
 */
194 #define REG_NOMATCH 1
196 #define REG_ECOLLATE 3
198 #define REG_EESCAPE 5
199 #define REG_ESUBREG 6
204 #define REG_ERANGE 11
205 #define REG_ESPACE 12
206 #define REG_BADRPT 13
208 #define REG_ASSERT 15
209 #define REG_INVARG 16
/*
 * Unlike the small decimal codes above, REG_ATOI is decimal 255 and REG_ITOA is
 * octal 0400 (decimal 256), i.e. a single bit just above the 0..255 range.
 */
210 #define REG_ATOI 255 /* convert name to number (!) */
211 #define REG_ITOA 0400 /* convert number to name (!) */
/*
 * vogl_regerror: declared with unnamed parameters (int, const regex_t *, char *, size_t),
 * returning size_t.
 * NOTE(review): presumably mirrors POSIX regerror(errcode, preg, errbuf, errbuf_size) --
 * confirm against regerror.c.
 */
212 extern size_t vogl_regerror(int, const regex_t *, char *, size_t);
214 /* === regexec.c === */
/*
 * vogl_regexec: declared with unnamed parameters
 * (const regex_t *, const char *, size_t, regmatch_t[], int), returning int.
 * NOTE(review): presumably mirrors POSIX regexec(preg, string, nmatch, pmatch, eflags),
 * with the REG_* values listed directly below OR'ed together to form the final int
 * argument -- confirm against regexec.c.
 */
215 extern int vogl_regexec(const regex_t *, const char *, size_t, regmatch_t[], int);
/*
 * Octal constants, each a single distinct bit within this group.
 * Note that REG_TRACE (00400) has the same numeric value as REG_ITOA (0400) from the
 * regerror.c section; NOTE(review): presumably harmless because the two are passed to
 * different functions -- confirm.
 */
216 #define REG_NOTBOL 00001
217 #define REG_NOTEOL 00002
218 #define REG_STARTEND 00004
219 #define REG_TRACE 00400 /* tracing of execution */
220 #define REG_LARGE 01000 /* force large representation */
221 #define REG_BACKR 02000 /* force use of backref code */
223 /* === regfree.c === */
/*
 * vogl_regfree: takes a regex_t pointer and returns nothing.
 * NOTE(review): presumably releases whatever vogl_regcomp() attached to the regex_t,
 * as POSIX regfree() does -- confirm against regfree.c.
 */
224 extern void vogl_regfree(regex_t *);
229 /* ========= end header generated by ./mkh ========= */