1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
50 #include "pcre2_internal.h"
52 /* In rare error cases debugging might require calling pcre2_printint(). */
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
64 /* Other debugging code can be enabled by these defines. */
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
74 #define XDIGIT(c) xdigitab[c]
76 #else /* Either 16-bit or 32-bit */
77 #define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
83 #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
91 #if PCRE2_SIZE_MAX <= UINT32_MAX
/* Single-element forms: an offset occupies exactly one element of the parsed
pattern vector. Every argument and every expansion is parenthesized so that the
macros behave predictably when given compound expressions or when used inside
a larger expression.

  PUTOFFSET(s,p)       store s at *p, then advance p by one element
  GETOFFSET(s,p)       load s from *p, then advance p by one element
  GETPLUSOFFSET(s,p)   advance p by one element, then load s from *p
  READPLUSOFFSET(s,p)  load s from p[1]; p is not changed
  SKIPOFFSET(p)        advance p by one element

s must be an lvalue for the GET/READ forms; p must be a modifiable pointer for
every form except READPLUSOFFSET. */

#define PUTOFFSET(s,p)      (*(p)++ = (s))
#define GETOFFSET(s,p)      ((s) = *(p)++)
#define GETPLUSOFFSET(s,p)  ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p)       ((p)++)
/* Two-element forms: an offset is split across two consecutive uint32_t
elements, high 32 bits first, low 32 bits second. The multi-statement macros
are wrapped in do { } while (0) so that each behaves as a single statement
(safe in an unbraced if/else), and all arguments are parenthesized so compound
expressions are grouped correctly. Note that the arguments are evaluated more
than once, so they must not have side effects.

  PUTOFFSET(s,p)       store s at p[0],p[1], then advance p by two elements
  GETOFFSET(s,p)       load s from p[0],p[1], then advance p by two elements
  GETPLUSOFFSET(s,p)   load s from p[1],p[2], then advance p by two elements
  READPLUSOFFSET(s,p)  load s from p[1],p[2]; p is not changed
  SKIPOFFSET(p)        advance p by two elements */

#define PUTOFFSET(s,p) \
  do { \
    *(p)++ = (uint32_t)((s) >> 32); \
    *(p)++ = (uint32_t)((s) & 0xffffffff); \
  } while (0)

#define GETOFFSET(s,p) \
  do { \
    (s) = ((PCRE2_SIZE)((p)[0]) << 32) | (PCRE2_SIZE)((p)[1]); \
    (p) += 2; \
  } while (0)

#define GETPLUSOFFSET(s,p) \
  do { \
    (s) = ((PCRE2_SIZE)((p)[1]) << 32) | (PCRE2_SIZE)((p)[2]); \
    (p) += 2; \
  } while (0)

#define READPLUSOFFSET(s,p) \
  do { \
    (s) = ((PCRE2_SIZE)((p)[1]) << 32) | (PCRE2_SIZE)((p)[2]); \
  } while (0)

#define SKIPOFFSET(p) ((p) += 2)
111 /* Macros for manipulating elements of the parsed pattern vector. */
/* META_CODE(x) keeps the top 16 bits of a parsed pattern element and
META_DATA(x) keeps the bottom 16 bits. META_DIFF(x,y) is the difference between
two elements shifted right by 16 bits. Arguments are parenthesized so that
compound expressions such as (a | b) or (a + b) are grouped before the mask,
subtraction, or shift is applied. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
117 /* Function definitions to allow mutual recursion */
119 #ifdef SUPPORT_UNICODE
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127 uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128 compile_block *, PCRE2_SIZE *);
131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
140 /*************************************************
141 * Code parameters and static tables *
142 *************************************************/
144 #define MAX_GROUP_NUMBER 65535u
145 #define MAX_REPEAT_COUNT 65535u
146 #define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)
148 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
149 different ways in the different pattern scans. The parsing and group-
150 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
151 aligned for this. Having defined the size in code units, we set up
152 C16_WORK_SIZE as the number of elements in the 16-bit vector.
154 During the first compiling phase, when determining how much memory is required,
155 the regex is partly compiled into this space, but the compiled parts are
156 discarded as soon as they can be, so that hopefully there will never be an
157 overrun. The code does, however, check for an overrun, which can occur for
158 pathological patterns. The size of the workspace depends on LINK_SIZE because
159 the length of compiled items varies with this.
161 In the real compile phase, this workspace is not currently used. */
163 #define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */
165 #define C16_WORK_SIZE \
166 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
168 /* A uint32_t vector is used for caching information about the size of
169 capturing groups, to improve performance. A default is created on the stack of
172 #define GROUPINFO_DEFAULT_SIZE 256
174 /* The overrun tests check for a slightly smaller size so that they detect the
175 overrun before it actually does run off the end of the data block. */
177 #define WORK_SIZE_SAFETY_MARGIN (100)
179 /* This value determines the size of the initial vector that is used for
180 remembering named groups during the pre-compile. It is allocated on the stack,
181 but if it is too small, it is expanded, in a similar way to the workspace. The
182 value is the number of slots in the list. */
184 #define NAMED_GROUP_LIST_SIZE 20
186 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
187 of uint32_t. For short patterns this lives on the stack, with this size. Heap
188 memory is used for longer patterns. */
190 #define PARSED_PATTERN_DEFAULT_SIZE 1024
192 /* Maximum length value to check against when making sure that the variable
193 that holds the compiled pattern length does not overflow. We make it a bit less
194 than INT_MAX to allow for adding in group terminating code units, so that we
195 don't have to check them every time. */
197 #define OFLOW_MAX (INT_MAX - 20)
199 /* Code values for parsed patterns, which are stored in a vector of 32-bit
200 unsigned ints. Values less than META_END are literal data values. The coding
201 for identifying the item is in the top 16-bits, leaving 16 bits for the
202 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
203 macros are used to manipulate parsed pattern elements.
205 NOTE: When these definitions are changed, the table of extra lengths for each
206 code (meta_extra_lengths, just below) must be updated to remain in step. */
208 #define META_END 0x80000000u /* End of pattern */
210 #define META_ALT 0x80010000u /* alternation */
211 #define META_ATOMIC 0x80020000u /* atomic group */
212 #define META_BACKREF 0x80030000u /* Back ref */
213 #define META_BACKREF_BYNAME 0x80040000u /* \k'name' */
214 #define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */
215 #define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */
216 #define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */
217 #define META_CAPTURE 0x80080000u /* Capturing parenthesis */
218 #define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */
219 #define META_CLASS 0x800a0000u /* start non-empty class */
220 #define META_CLASS_EMPTY 0x800b0000u /* empty class */
221 #define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */
222 #define META_CLASS_END 0x800d0000u /* end of non-empty class */
223 #define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */
224 #define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */
225 #define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */
226 #define META_COND_NAME 0x80110000u /* (?(<name>)... */
227 #define META_COND_NUMBER 0x80120000u /* (?(digits)... */
228 #define META_COND_RNAME 0x80130000u /* (?(R&name)... */
229 #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
230 #define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
231 #define META_DOLLAR 0x80160000u /* $ metacharacter */
232 #define META_DOT 0x80170000u /* . metacharacter */
233 #define META_ESCAPE 0x80180000u /* \d and friends */
234 #define META_KET 0x80190000u /* closing parenthesis */
235 #define META_NOCAPTURE 0x801a0000u /* no capture parens */
236 #define META_OPTIONS 0x801b0000u /* (?i) and friends */
237 #define META_POSIX 0x801c0000u /* POSIX class item */
238 #define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
239 #define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
240 #define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
241 #define META_RECURSE 0x80200000u /* Recursion */
242 #define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
244 /* These must be kept together to make it easy to check that an assertion
245 is present where expected in a conditional group. */
247 #define META_LOOKAHEAD 0x80220000u /* (?= */
248 #define META_LOOKAHEADNOT 0x80230000u /* (?! */
249 #define META_LOOKBEHIND 0x80240000u /* (?<= */
250 #define META_LOOKBEHINDNOT 0x80250000u /* (?<! */
252 /* These must be kept in this order, with consecutive values, and the _ARG
253 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
256 #define META_MARK 0x80260000u /* (*MARK) */
257 #define META_ACCEPT 0x80270000u /* (*ACCEPT) */
258 #define META_FAIL 0x80280000u /* (*FAIL) */
259 #define META_COMMIT 0x80290000u /* These */
260 #define META_COMMIT_ARG 0x802a0000u /* pairs */
261 #define META_PRUNE 0x802b0000u /* must */
262 #define META_PRUNE_ARG 0x802c0000u /* be */
263 #define META_SKIP 0x802d0000u /* kept */
264 #define META_SKIP_ARG 0x802e0000u /* in */
265 #define META_THEN 0x802f0000u /* this */
266 #define META_THEN_ARG 0x80300000u /* order */
268 /* These must be kept in groups of 3 adjacent values, and all together. */
270 #define META_ASTERISK 0x80310000u /* * */
271 #define META_ASTERISK_PLUS 0x80320000u /* *+ */
272 #define META_ASTERISK_QUERY 0x80330000u /* *? */
273 #define META_PLUS 0x80340000u /* + */
274 #define META_PLUS_PLUS 0x80350000u /* ++ */
275 #define META_PLUS_QUERY 0x80360000u /* +? */
276 #define META_QUERY 0x80370000u /* ? */
277 #define META_QUERY_PLUS 0x80380000u /* ?+ */
278 #define META_QUERY_QUERY 0x80390000u /* ?? */
279 #define META_MINMAX 0x803a0000u /* {n,m} repeat */
280 #define META_MINMAX_PLUS 0x803b0000u /* {n,m}+ repeat */
281 #define META_MINMAX_QUERY 0x803c0000u /* {n,m}? repeat */
283 #define META_FIRST_QUANTIFIER META_ASTERISK
284 #define META_LAST_QUANTIFIER META_MINMAX_QUERY
286 /* Table of extra lengths for each of the meta codes. Must be kept in step with
287 the definitions above. For some items these values are a basic length to which
288 a variable amount has to be added. */
290 static unsigned char meta_extra_lengths[] = {
294 0, /* META_BACKREF - more if group is >= 10 */
295 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
296 1, /* META_BIGVALUE */
297 3, /* META_CALLOUT_NUMBER */
298 3+SIZEOFFSET, /* META_CALLOUT_STRING */
299 0, /* META_CAPTURE */
300 0, /* META_CIRCUMFLEX */
302 0, /* META_CLASS_EMPTY */
303 0, /* META_CLASS_EMPTY_NOT */
304 0, /* META_CLASS_END */
305 0, /* META_CLASS_NOT */
306 0, /* META_COND_ASSERT */
307 SIZEOFFSET, /* META_COND_DEFINE */
308 1+SIZEOFFSET, /* META_COND_NAME */
309 1+SIZEOFFSET, /* META_COND_NUMBER */
310 1+SIZEOFFSET, /* META_COND_RNAME */
311 1+SIZEOFFSET, /* META_COND_RNUMBER */
312 3, /* META_COND_VERSION */
315 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
317 0, /* META_NOCAPTURE */
318 1, /* META_OPTIONS */
320 1, /* META_POSIX_NEG */
321 0, /* META_RANGE_ESCAPED */
322 0, /* META_RANGE_LITERAL */
323 SIZEOFFSET, /* META_RECURSE */
324 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
325 0, /* META_LOOKAHEAD */
326 0, /* META_LOOKAHEADNOT */
327 SIZEOFFSET, /* META_LOOKBEHIND */
328 SIZEOFFSET, /* META_LOOKBEHINDNOT */
329 1, /* META_MARK - plus the string length */
333 1, /* META_COMMIT_ARG - plus the string length */
335 1, /* META_PRUNE_ARG - plus the string length */
337 1, /* META_SKIP_ARG - plus the string length */
339 1, /* META_THEN_ARG - plus the string length */
340 0, /* META_ASTERISK */
341 0, /* META_ASTERISK_PLUS */
342 0, /* META_ASTERISK_QUERY */
344 0, /* META_PLUS_PLUS */
345 0, /* META_PLUS_QUERY */
347 0, /* META_QUERY_PLUS */
348 0, /* META_QUERY_QUERY */
350 2, /* META_MINMAX_PLUS */
351 2 /* META_MINMAX_QUERY */
354 /* Types for skipping parts of a parsed pattern. */
356 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
358 /* Macro for setting individual bits in class bitmaps. It took some
359 experimenting to figure out how to stop gcc 5.3.0 from warning with
360 -Wconversion. This version gets a warning:
362 #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))
364 Let's hope the apparently less efficient version isn't actually so bad if the
365 compiler is clever with identical subexpressions. */
/* Set bit number b in the byte-array bitmap a: byte b/8, bit b&7. The array
argument is parenthesized so that pointer expressions such as (map + n) index
correctly, and the result is cast back to uint8_t after the OR. Both arguments
are evaluated more than once, so they must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1 << ((b)&7))))
369 /* Private flags added to firstcu and reqcu. */
371 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
372 #define REQ_VARY (1 << 1) /* reqcu followed non-literal item */
373 /* Negative values for the firstcu and reqcu flags */
374 #define REQ_UNSET (-2) /* Not yet found anything */
375 #define REQ_NONE (-1) /* Found not fixed char */
377 /* These flags are used in the groupinfo vector. */
379 #define GI_SET_FIXED_LENGTH 0x80000000u
380 #define GI_NOT_FIXED_LENGTH 0x40000000u
381 #define GI_FIXED_LENGTH_MASK 0x0000ffffu
383 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
384 and is fast (a good compiler can turn it into a subtraction and unsigned
387 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
389 /* Table to identify hex digits. The tables in chartables are dependent on the
390 locale, and may mark arbitrary characters as digits. We want to recognize only
391 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
392 costs 256 bytes, but it is a lot faster than doing character value tests (at
393 least in some simple cases I timed), and in some applications one wants PCRE2
394 to compile efficiently as well as match efficiently. The value in the table is
395 the binary hex digit value, or 0xff for non-hex digits. */
397 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
401 static const uint8_t xdigitab[] =
403 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */
404 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
405 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */
406 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
407 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */
408 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */
409 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */
410 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */
411 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */
412 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */
413 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */
414 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */
415 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */
416 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */
417 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */
418 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */
419 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
420 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
421 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
422 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
423 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
424 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
425 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
426 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
427 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
428 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
429 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
430 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
431 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
432 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
433 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
434 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
438 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
440 static const uint8_t xdigitab[] =
442 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */
443 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
444 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */
445 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
446 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */
447 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */
448 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */
449 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */
450 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */
451 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */
452 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */
453 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */
454 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */
455 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */
456 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
457 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */
458 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */
459 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */
460 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */
461 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */
462 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */
463 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */
464 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */
465 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
466 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */
467 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */
468 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */
469 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */
470 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */
471 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */
472 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */
473 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */
477 /* Table for handling alphanumeric escaped characters. Positive returns are
478 simple data values; negative values are for special things like \d and so on.
479 Zero means further processing is needed (for things like \x), or the escape is
482 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
483 in UTF-8 mode. It runs from '0' to 'z'. */
/* Bounds of the escapes table (first and last character it covers), and the
conversion from a lower case letter to its upper case equivalent, which is 32
lower in this encoding. The UPPER_CASE argument is parenthesized so that a
compound expression is evaluated before the adjustment is applied. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
490 static const short int escapes[] = {
496 CHAR_COLON, CHAR_SEMICOLON,
497 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
498 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
499 CHAR_COMMERCIAL_AT, -ESC_A,
512 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
513 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
514 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
515 CHAR_GRAVE_ACCENT, CHAR_BEL,
533 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
534 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
535 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
536 because it is defined as 'a', which of course picks up the ASCII value. */
538 #if 'a' == 0x81 /* Check for a real EBCDIC environment */
/* Real EBCDIC environment: the table bounds come from the character macros,
and an upper case letter is 64 above its lower case equivalent. The UPPER_CASE
argument is parenthesized so that a compound expression is evaluated before
the adjustment is applied. */

#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
542 #else /* Testing in an ASCII environment */
/* EBCDIC tables being tested on an ASCII host: the table bounds are given as
explicit EBCDIC code values because the character macros would yield ASCII
values here. The UPPER_CASE argument is parenthesized so that a compound
expression is evaluated before the adjustment is applied. */

#define ESCAPES_FIRST       ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST        ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)       ((c)-32)
548 static const short int escapes[] = {
549 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
550 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
551 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
552 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
553 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
554 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
555 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
556 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
557 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
558 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
559 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
560 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
561 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
562 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
563 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
567 /* We also need a table of characters that may follow \c in an EBCDIC
568 environment for characters 0-31. */
/* The 32 characters that may follow \c, in order of the control character
value (0-31) that each one selects. The table is only a lookup source, so it is
declared const to keep it out of writable data and to catch accidental
modification at compile time. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
575 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
576 searched linearly. Put all the names into a single string, in order to reduce
577 the number of relocations when a shared library is dynamically linked. The
578 string is built from string macros so that it works in UTF-8 mode on EBCDIC
581 typedef struct verbitem {
582 unsigned int len; /* Length of verb name */
583 uint32_t meta; /* Base META_ code */
584 int has_arg; /* Argument requirement */
587 static const char verbnames[] =
588 "\0" /* Empty name is a shorthand for MARK */
598 static const verbitem verbs[] = {
599 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
600 { 4, META_MARK, +1 },
601 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
602 { 1, META_FAIL, -1 },
603 { 4, META_FAIL, -1 },
604 { 6, META_COMMIT, 0 },
605 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
610 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
612 /* Verb opcodes, indexed by their META code offset from META_MARK. */
614 static const uint32_t verbops[] = {
615 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
616 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
618 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
620 static uint32_t chartypeoffset[] = {
621 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
622 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
624 /* Tables of names of POSIX character classes and their lengths. The names are
625 now all in a single string, to reduce the number of relocations when a shared
626 library is dynamically loaded. The list of lengths is terminated by a zero
627 length entry. The first three must be alpha, lower, upper, as this is assumed
628 for handling case independence. The indices for graph, print, and punct are
629 needed, so identify them. */
631 static const char posix_names[] =
632 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
633 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
634 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
635 STRING_word0 STRING_xdigit;
637 static const uint8_t posix_name_lengths[] = {
638 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
644 /* Table of class bit maps for each POSIX class. Each class is formed from a
645 base map, with an optional addition or removal of another map. Then, for some
646 classes, there is some additional tweaking: for [:blank:] the vertical space
647 characters are removed, and for [:alpha:] and [:alnum:] the underscore
648 character is removed. The triples in the table consist of the base map offset,
649 second map offset or -1 if no second map, and a non-negative value for map
650 addition or a negative value for map subtraction (if there are two maps). The
651 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
652 remove vertical space characters, 2 => remove underscore. */
654 static const int posix_class_maps[] = {
655 cbit_word, cbit_digit, -2, /* alpha */
656 cbit_lower, -1, 0, /* lower */
657 cbit_upper, -1, 0, /* upper */
658 cbit_word, -1, 2, /* alnum - word without underscore */
659 cbit_print, cbit_cntrl, 0, /* ascii */
660 cbit_space, -1, 1, /* blank - a GNU extension */
661 cbit_cntrl, -1, 0, /* cntrl */
662 cbit_digit, -1, 0, /* digit */
663 cbit_graph, -1, 0, /* graph */
664 cbit_print, -1, 0, /* print */
665 cbit_punct, -1, 0, /* punct */
666 cbit_space, -1, 0, /* space */
667 cbit_word, -1, 0, /* word - a Perl extension */
668 cbit_xdigit,-1, 0 /* xdigit */
671 #ifdef SUPPORT_UNICODE
673 /* The POSIX class Unicode property substitutes that are used in UCP mode must
674 be in the order of the POSIX class names, defined above. */
676 static int posix_substitutes[] = {
677 PT_GC, ucp_L, /* alpha */
678 PT_PC, ucp_Ll, /* lower */
679 PT_PC, ucp_Lu, /* upper */
680 PT_ALNUM, 0, /* alnum */
681 -1, 0, /* ascii, treat as non-UCP */
682 -1, 1, /* blank, treat as \h */
683 PT_PC, ucp_Cc, /* cntrl */
684 PT_PC, ucp_Nd, /* digit */
685 PT_PXGRAPH, 0, /* graph */
686 PT_PXPRINT, 0, /* print */
687 PT_PXPUNCT, 0, /* punct */
688 PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
689 PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
690 -1, 0 /* xdigit, treat as non-UCP */
/* Number of (type, value) pairs in posix_substitutes. The divisor is taken
from the array's own element type, so the count stays correct whatever the
element type of the table is. */

#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
693 #endif /* SUPPORT_UNICODE */
695 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
698 #define PUBLIC_LITERAL_COMPILE_OPTIONS \
699 (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
700 PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_NO_START_OPTIMIZE| \
701 PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
703 #define PUBLIC_COMPILE_OPTIONS \
704 (PUBLIC_LITERAL_COMPILE_OPTIONS| \
705 PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
706 PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
707 PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
708 PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
709 PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
710 PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
712 #define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
713 (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)
715 #define PUBLIC_COMPILE_EXTRA_OPTIONS \
716 (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
717 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL)
719 /* Compile time error code numbers. They are given names so that they can more
720 easily be tracked. When a new number is added, the tables called eint1 and
721 eint2 in pcre2posix.c may need to be updated, and a new error text must be
722 added to compile_error_texts in pcre2_error.c. */
724 enum { ERR0 = COMPILE_ERROR_BASE,
725 ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
726 ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
727 ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
728 ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
729 ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
730 ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
731 ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
732 ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
733 ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
734 ERR91, ERR92, ERR93, ERR94 };
736 /* This is a table of start-of-pattern options such as (*UTF) and settings such
737 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
738 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
739 generic and always supported. */
741 enum { PSO_OPT, /* Value is an option bit */
742 PSO_FLG, /* Value is a flag bit */
743 PSO_NL, /* Value is a newline type */
744 PSO_BSR, /* Value is a \R type */
745 PSO_LIMH, /* Read integer value for heap limit */
746 PSO_LIMM, /* Read integer value for match limit */
747 PSO_LIMD }; /* Read integer value for depth limit */
756 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
758 static pso pso_list[] = {
759 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
760 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
761 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
762 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
763 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
764 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
765 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
766 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
767 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
768 { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
769 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
770 { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
771 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
772 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
773 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
774 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
775 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
776 { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
777 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
778 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
779 { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
782 /* This table is used when converting repeating opcodes into possessified
783 versions as a result of an explicit possessive quantifier such as ++. A zero
784 value means there is no possessified version - in those cases the item in
785 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
786 because all relevant opcodes are less than that. */
/* Reading the rows: each two-entry row covers an opcode and its MIN counterpart
(for example STAR, MINSTAR). The first entry names the possessive opcode to use
and the second is zero. The rows labelled POS{...}, NOTPOS{...}, TYPEPOS{...}
and CRPOS{...} are all zero. Presumably the table is indexed by opcode number,
as the "0 - 15" and "16 - 31" row labels suggest - confirm against the opcode
definitions. */
788 static const uint8_t opcode_possessify[] = {
789 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
790 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
/* Repeats of a single character. */
793 OP_POSSTAR, 0, /* STAR, MINSTAR */
794 OP_POSPLUS, 0, /* PLUS, MINPLUS */
795 OP_POSQUERY, 0, /* QUERY, MINQUERY */
796 OP_POSUPTO, 0, /* UPTO, MINUPTO */
798 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
/* The same set again for the opcodes whose names end in I. */
800 OP_POSSTARI, 0, /* STARI, MINSTARI */
801 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
802 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
803 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
805 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
/* The NOT family. */
807 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
808 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
809 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
810 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
812 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
/* The NOT family with the I suffix. */
814 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
815 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
816 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
817 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
819 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
/* The TYPE family. */
821 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
822 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
823 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
824 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
826 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
/* The CR family; note that the fourth member is RANGE rather than UPTO. */
828 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
829 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
830 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
831 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
832 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
/* The remaining opcodes up to CALLOUT have no possessified version. */
834 0, 0, 0, /* CLASS, NCLASS, XCLASS */
835 0, 0, /* REF, REFI */
836 0, 0, /* DNREF, DNREFI */
837 0, 0 /* RECURSE, CALLOUT */
841 #ifdef DEBUG_SHOW_PARSED
842 /*************************************************
843 * Show the parsed pattern for debugging *
844 *************************************************/
846 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
/* Debugging aid, compiled only when DEBUG_SHOW_PARSED is defined: walks the
vector at cb->parsed_pattern and writes a textual description of its elements
to stderr. */
849 static void show_parsed(compile_block *cb)
851 uint32_t *pptr = cb->parsed_pattern;
/* meta_arg is the data part of the current element, as extracted by the
META_DATA() macro. */
859 uint32_t meta_arg = META_DATA(*pptr);
/* Each element is introduced by its index in the vector (two decimal digits)
and its raw value in hexadecimal. */
861 fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
/* Values below META_END are handled here rather than by the switch; those in
the range 33 to 127 are shown as characters. */
863 if (*pptr < META_END)
865 if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
/* Everything else is dispatched on its META code. pptr is stepped past the
element before the case body runs, so *pptr in a case body refers to the item
that follows it. */
869 else switch (META_CODE(*pptr++))
872 fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
876 fprintf(stderr, "META_END\n");
880 fprintf(stderr, "META_CAPTURE %d", meta_arg);
884 GETOFFSET(offset, pptr);
885 fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
890 offset = cb->small_ref_offset[meta_arg];
892 GETOFFSET(offset, pptr);
893 fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
897 if (meta_arg == ESC_P || meta_arg == ESC_p)
899 uint32_t ptype = *pptr >> 16;
900 uint32_t pvalue = *pptr++ & 0xffff;
901 fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
907 /* There's just one escape we might have here that isn't negated in the
909 if (meta_arg == ESC_g) cc = CHAR_g;
910 else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
912 if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
914 if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
915 fprintf(stderr, "META \\%c", cc);
922 if (max != REPEAT_UNLIMITED)
923 fprintf(stderr, "META {%d,%d}", min, max);
925 fprintf(stderr, "META {%d,}", min);
928 case META_MINMAX_QUERY:
931 if (max != REPEAT_UNLIMITED)
932 fprintf(stderr, "META {%d,%d}?", min, max);
934 fprintf(stderr, "META {%d,}?", min);
937 case META_MINMAX_PLUS:
940 if (max != REPEAT_UNLIMITED)
941 fprintf(stderr, "META {%d,%d}+", min, max);
943 fprintf(stderr, "META {%d,}+", min);
946 case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
947 case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
948 case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
949 case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
950 case META_DOT: fprintf(stderr, "META_DOT"); break;
951 case META_ASTERISK: fprintf(stderr, "META *"); break;
952 case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
953 case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
954 case META_PLUS: fprintf(stderr, "META +"); break;
955 case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
956 case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
957 case META_QUERY: fprintf(stderr, "META ?"); break;
958 case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
959 case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
961 case META_ATOMIC: fprintf(stderr, "META (?>"); break;
962 case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
963 case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
964 case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
965 case META_KET: fprintf(stderr, "META )"); break;
966 case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
968 case META_CLASS: fprintf(stderr, "META ["); break;
969 case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
970 case META_CLASS_END: fprintf(stderr, "META ]"); break;
971 case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
972 case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
974 case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
975 case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
977 case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
978 case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
980 case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
981 case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
982 case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
983 case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
984 case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
985 case META_THEN: fprintf(stderr, "META (*THEN)"); break;
987 case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;
989 case META_LOOKBEHIND:
990 fprintf(stderr, "META (?<= %d offset=", meta_arg);
991 GETOFFSET(offset, pptr);
992 fprintf(stderr, "%zd", offset);
995 case META_LOOKBEHINDNOT:
996 fprintf(stderr, "META (?<! %d offset=", meta_arg);
997 GETOFFSET(offset, pptr);
998 fprintf(stderr, "%zd", offset);
1001 case META_CALLOUT_NUMBER:
1002 fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
1007 case META_CALLOUT_STRING:
1009 uint32_t patoffset = *pptr++; /* Offset of next pattern item */
1010 uint32_t patlength = *pptr++; /* Length of next pattern item */
/* The third item is printed as the string length; the string's offset is then
fetched with GETOFFSET(). */
1011 fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
1012 GETOFFSET(offset, pptr);
1013 fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
/* The by-name items that follow share one layout: a length item, then an
offset that is fetched with GETOFFSET(). */
1017 case META_RECURSE_BYNAME:
1018 fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
1019 GETOFFSET(offset, pptr);
1020 fprintf(stderr, "%zd", offset);
1023 case META_BACKREF_BYNAME:
1024 fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
1025 GETOFFSET(offset, pptr);
1026 fprintf(stderr, "%zd", offset);
/* Here the value at pptr[SIZEOFFSET] is printed first, and the offset is
fetched afterwards. */
1029 case META_COND_NUMBER:
1030 fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
1031 GETOFFSET(offset, pptr);
1032 fprintf(stderr, "%zd", offset);
1036 case META_COND_DEFINE:
1037 fprintf(stderr, "META (?(DEFINE) offset=");
1038 GETOFFSET(offset, pptr);
1039 fprintf(stderr, "%zd", offset);
/* Three consecutive items: a flag (zero prints "=", anything else ">="), then
two numbers that are printed separated by a dot. */
1042 case META_COND_VERSION:
1043 fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
1044 fprintf(stderr, "%d.", *pptr++);
1045 fprintf(stderr, "%d)", *pptr++);
1048 case META_COND_NAME:
1049 fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
1050 GETOFFSET(offset, pptr);
1051 fprintf(stderr, "%zd", offset);
1054 case META_COND_RNAME:
1055 fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
1056 GETOFFSET(offset, pptr);
1057 fprintf(stderr, "%zd", offset);
1060 /* This is kept as a name, because it might be. */
1062 case META_COND_RNUMBER:
1063 fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
1064 GETOFFSET(offset, pptr);
1065 fprintf(stderr, "%zd", offset);
/* The verbs that carry an argument each print their own prefix. */
1069 fprintf(stderr, "META (*MARK:");
1072 case META_COMMIT_ARG:
1073 fprintf(stderr, "META (*COMMIT:");
1076 case META_PRUNE_ARG:
1077 fprintf(stderr, "META (*PRUNE:");
1081 fprintf(stderr, "META (*SKIP:");
1085 fprintf(stderr, "META (*THEN:");
/* Print the next `length` items of the vector: values from 33 to 127 as
characters and anything else in \x{...} form, followed by the length. */
1088 for (i = 0; i < length; i++)
1090 uint32_t cc = *pptr++;
1091 if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
1092 else fprintf(stderr, "\\x{%x}", cc);
1094 fprintf(stderr, ") length=%u", length);
1097 fprintf(stderr, "\n");
1101 #endif /* DEBUG_SHOW_PARSED */
1105 /*************************************************
1106 * Copy compiled code *
1107 *************************************************/
1109 /* Compiled JIT code cannot be copied, so the new compiled block has no
1110 associated JIT data. */
/* The copy is allocated with the malloc function held in the original's memctl
and is filled by copying code->blocksize bytes from the original. NULL is
returned if code is NULL or if the allocation fails. */
1112 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1113 pcre2_code_copy(const pcre2_code *code)
1115 PCRE2_SIZE* ref_count;
1116 pcre2_code *newcode;
1118 if (code == NULL) return NULL;
1119 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1120 if (newcode == NULL) return NULL;
1121 memcpy(newcode, code, code->blocksize);
/* The memcpy() above duplicated the JIT pointer as well; clear it in the copy. */
1122 newcode->executable_jit = NULL;
1124 /* If the code is one that has been deserialized, increment the reference count
1125 in the decoded tables. */
/* The count is addressed as a PCRE2_SIZE at offset tables_length from the
start of the tables, and only when PCRE2_DEREF_TABLES is set in the flags. */
1127 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1129 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1138 /*************************************************
1139 * Copy compiled code and character tables *
1140 *************************************************/
1142 /* Compiled JIT code cannot be copied, so the new compiled block has no
1143 associated JIT data. This version of code_copy also makes a separate copy of
1144 the character tables. */
/* Both allocations use the malloc function held in the original's memctl. NULL
is returned if code is NULL or if the new code block cannot be allocated. */
1146 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1147 pcre2_code_copy_with_tables(const pcre2_code *code)
1149 PCRE2_SIZE* ref_count;
1150 pcre2_code *newcode;
1153 if (code == NULL) return NULL;
1154 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1155 if (newcode == NULL) return NULL;
1156 memcpy(newcode, code, code->blocksize);
/* The memcpy() above duplicated the JIT pointer as well; clear it in the copy. */
1157 newcode->executable_jit = NULL;
/* The tables block holds tables_length bytes of table data followed by room
for one PCRE2_SIZE. */
1159 newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
1160 code->memctl.memory_data);
/* If the tables cannot be allocated, the code block obtained above is handed
back to the allocator. */
1161 if (newtables == NULL)
1163 code->memctl.free((void *)newcode, code->memctl.memory_data);
1166 memcpy(newtables, code->tables, tables_length);
/* ref_count addresses the PCRE2_SIZE slot that follows the copied tables. */
1167 ref_count = (PCRE2_SIZE *)(newtables + tables_length);
/* Attach the private tables to the copy and set PCRE2_DEREF_TABLES in its
flags. */
1170 newcode->tables = newtables;
1171 newcode->flags |= PCRE2_DEREF_TABLES;
1177 /*************************************************
1178 * Free compiled code *
1179 *************************************************/
/* Release a compiled pattern. Any JIT data is released first. If the flags
contain PCRE2_DEREF_TABLES, the count stored at offset tables_length in the
tables is consulted and the tables are freed when it is zero. Finally the code
block itself is freed with the free function and memory_data held in its own
memctl. */
1181 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1182 pcre2_code_free(pcre2_code *code)
1184 PCRE2_SIZE* ref_count;
/* JIT data, when present, is released through PRIV(jit_free), which is passed
the block's memctl. */
1188 if (code->executable_jit != NULL)
1189 PRIV(jit_free)(code->executable_jit, &code->memctl);
1191 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1193 /* Decoded tables belong to the codes after deserialization, and they must
1194 be freed when there are no more reference to them. The *ref_count should
1197 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1201 if (*ref_count == 0)
1202 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1206 code->memctl.free(code, code->memctl.memory_data);
1212 /*************************************************
1213 * Read a number, possibly signed *
1214 *************************************************/
1216 /* This function is used to read numbers in the pattern. The initial pointer
1217 must be the sign or first digit of the number. When relative values (introduced
1218 by + or -) are allowed, they are relative group numbers, and the result must be
1222 ptrptr points to the character pointer variable
1223 ptrend points to the end of the input string
1224 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1225 max_value the largest number allowed
1226 max_error the error to give for an over-large number
1227 intptr where to put the result
1228 errorcodeptr where to put an error code
1230 Returns: TRUE - a number was read
1231 FALSE - errorcode == 0 => no number was found
1232 errorcode != 0 => an error occurred
/* Parse an optionally signed decimal number starting at *ptrptr, using ptr as
the working pointer. The arguments and return values are described in the
comment that precedes this function. */
1236 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1237 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1241 PCRE2_SPTR ptr = *ptrptr;
/* A leading '+' or '-' is looked for only when allow_sign is non-negative and
there is input left. */
1246 if (allow_sign >= 0 && ptr < ptrend)
1248 if (*ptr == CHAR_PLUS)
/* For '+' the limit is lowered by allow_sign, because allow_sign is added to
the value further down. */
1251 max_value -= allow_sign;
1254 else if (*ptr == CHAR_MINUS)
/* Without at least one digit at this point there is no number to read. */
1261 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
/* Accumulate the decimal digits into n. */
1262 while (ptr < ptrend && IS_DIGIT(*ptr))
1264 n = n * 10 + *ptr++ - CHAR_0;
/* max_error is the caller-supplied code for a number that is too large. */
1267 *errorcodeptr = max_error;
/* A signed value is relative and is converted to an absolute one here. */
1272 if (allow_sign >= 0 && sign != 0)
1276 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
/* +n counts forward from allow_sign. -n counts backward, with -1 mapping to
allow_sign itself; a magnitude larger than allow_sign gives ERR15. */
1280 if (sign > 0) n += allow_sign;
1281 else if ((int)n > allow_sign)
1283 *errorcodeptr = ERR15; /* Non-existent subpattern */
1286 else n = allow_sign + 1 - n;
1299 /*************************************************
1300 * Read repeat counts *
1301 *************************************************/
1303 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1304 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1305 larger value is used for "unlimited". We have to use signed arguments for
1306 read_number() because it is capable of returning a signed value.
1309 ptrptr points to pointer to character after '{'
1310 ptrend pointer to end of input
1311 minp if not NULL, pointer to uint32_t for min
1312 maxp if not NULL, pointer to uint32_t for max
1313 returned as REPEAT_UNLIMITED if no max
1314 errorcodeptr points to error code variable
1316 Returns: FALSE if not a repeat quantifier, errorcode set zero
1317 FALSE on error, with errorcode set non-zero
1318 TRUE on success, with pointer updated to point after '}'
/* Parse the counts of a repeat quantifier. p is a working copy of *ptrptr; the
caller's pointer is written back only when the quantifier was read or an error
code was set. The arguments and return values are described in the comment
that precedes this function. */
1322 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1323 uint32_t *maxp, int *errorcodeptr)
1325 PCRE2_SPTR p = *ptrptr;
1328 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1330 /* NB read_number() initializes the error code to zero. The only error is for a
1331 number that is too big. */
/* The minimum is read with signs disallowed (allow_sign is -1), with
MAX_REPEAT_COUNT as the limit and ERR5 as the error for an over-large value. */
1333 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1336 if (p >= ptrend) goto EXIT;
1338 if (*p == CHAR_RIGHT_CURLY_BRACKET)
/* Otherwise a comma must come next, with more input after it. */
1346 if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
/* Anything other than '}' after the comma has to be a maximum that is
immediately followed by '}'. */
1347 if (*p != CHAR_RIGHT_CURLY_BRACKET)
1349 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1350 errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1354 *errorcodeptr = ERR4;
/* Results are stored only through non-NULL pointers, converted to uint32_t. */
1362 if (minp != NULL) *minp = (uint32_t)min;
1363 if (maxp != NULL) *maxp = (uint32_t)max;
1365 /* Update the pattern pointer on success, or after an error, but not when
1366 the result is "not a repeat quantifier". */
1369 if (yield || *errorcodeptr != 0) *ptrptr = p;
1378 /*************************************************
1380 *************************************************/
1382 /* This function is called when a \ has been encountered. It either returns a
1383 positive value for a simple escape such as \d, or 0 for a data character, which
1384 is placed in chptr. A backreference to group n is returned as negative n. On
1385 entry, ptr is pointing at the character after \. On exit, it points after the
1386 final code unit of the escape sequence.
1388 This function is also called from pcre2_substitute() to handle escape sequences
1389 in replacement strings. In this case, the cb argument is NULL, and in the case
1390 of escapes that have further processing, only sequences that define a data
1391 character are recognised. The isclass argument is not relevant; the options
1392 argument is the final value of the compiled pattern's options.
1395 ptrptr points to the input position pointer
1396 ptrend points to the end of the input
1397 chptr points to a returned data character
1398 errorcodeptr points to the errorcode variable (containing zero)
1399 options the current options bits
1400 isclass TRUE if inside a character class
1401 cb compile data block
1403 Returns: zero => a data character
1404 positive => a special escape sequence
1405 negative => a numerical back reference
1406 on error, errorcodeptr is set non-zero
/* Decode the escape sequence that follows a backslash. The arguments and
return values are described in the comment that precedes this function. ptr is
a working copy of *ptrptr, and utf records whether PCRE2_UTF is set in
options. */
1410 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1411 int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
1413 BOOL utf = (options & PCRE2_UTF) != 0;
1414 PCRE2_SPTR ptr = *ptrptr;
1419 /* If backslash is at the end of the string, it's an error. */
1423 *errorcodeptr = ERR1;
1427 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1428 *errorcodeptr = 0; /* Be optimistic */
1430 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1431 value test saves a memory lookup for code points outside the alphanumeric
1432 range. Otherwise, do a table lookup. A non-zero result is something that can be
1433 returned immediately. Otherwise further processing is required. */
1435 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1437 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1439 if (i > 0) c = (uint32_t)i; else /* Positive is a data character */
1441 escape = -i; /* Else return a special escape */
1442 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1443 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1445 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1446 Unicode code points, as well as plain \N for "not newline". PCRE does not
1447 support \N{name}. However, it does support quantification such as \N{2,3},
1448 so if \N{ is not followed by U+dddd we check for a quantifier. */
1450 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1452 PCRE2_SPTR p = ptr + 1;
1454 /* \N{U+ can be handled by the \x{ code. However, this construction is
1455 not valid in EBCDIC environments because it specifies a Unicode
1456 character, not a codepoint in the local code. For example \N{U+0041}
1457 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1458 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1461 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1464 *errorcodeptr = ERR93;
1469 escape = 0; /* Not a fancy escape after all */
1472 else *errorcodeptr = ERR93;
1476 /* Give an error if what follows is not a quantifier, but don't override
1477 an error set by the quantifier reader (e.g. number overflow). */
1481 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1483 *errorcodeptr = ERR37;
1489 /* Escapes that need further processing, including those that are unknown.
1490 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
1491 when BSUX is set). */
1499 /* Filter calls from pcre2_substitute(). */
/* NOTE(review): the comment above says \u is recognized when BSUX is set, but
the test below gives ERR3 for \u exactly when PCRE2_ALT_BSUX is set and lets it
through when the option is clear. Confirm which behaviour is intended. */
1501 if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
1502 (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
1504 *errorcodeptr = ERR3;
1510 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1516 *errorcodeptr = ERR37;
1519 /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
1520 specially, \u must be followed by four hex digits. Otherwise it is a
1521 lowercase u letter. */
1524 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
1527 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1528 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1529 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1530 cc = (cc << 4) | xc;
1531 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1532 cc = (cc << 4) | xc;
1533 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1538 if (c > 0x10ffffU) *errorcodeptr = ERR77;
/* NOTE(review): cb is dereferenced here without the "cb == NULL ||" guard that
the \o{} and \x{} surrogate checks use, so this code relies on cb being
non-NULL whenever it is reached - confirm against the substitute filter. */
1540 if (c >= 0xd800 && c <= 0xdfff &&
1541 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1542 *errorcodeptr = ERR73;
1544 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1548 /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
1549 upper case letter. */
1552 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
1555 /* In a character class, \g is just a literal "g". Outside a character
1556 class, \g must be followed by one of a number of specific things:
1558 (1) A number, either plain or braced. If positive, it is an absolute
1559 backreference. If negative, it is a relative backreference. This is a Perl
1562 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1563 is part of Perl's movement towards a unified syntax for back references. As
1564 this is synonymous with \k{name}, we fudge it up by pretending it really
1567 (3) For Oniguruma compatibility we also support \g followed by a name or a
1568 number either in angle brackets or in single quotes. However, these are
1569 (possibly recursive) subroutine calls, _not_ backreferences. We return
1572 Summary: Return a negative number for a numerical back reference, ESC_k for
1573 a named back reference, and ESC_g for a named or numbered subroutine call.
1581 *errorcodeptr = ERR57;
1585 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1591 /* If there is a brace delimiter, try to read a numerical reference. If
1592 there isn't one, assume we have a name and treat it as \k. */
1594 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1596 PCRE2_SPTR p = ptr + 1;
1597 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1600 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1603 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1605 *errorcodeptr = ERR57;
1611 /* Read an undelimited number */
1615 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1618 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1625 *errorcodeptr = ERR15;
1632 /* The handling of escape sequences consisting of a string of digits
1633 starting with one that is not zero is not straightforward. Perl has changed
1634 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1635 recommended to avoid the ambiguities in the old syntax.
1637 Outside a character class, the digits are read as a decimal number. If the
1638 number is less than 10, or if there are that many previous extracting left
1639 brackets, it is a back reference. Otherwise, up to three octal digits are
1640 read to form an escaped character code. Thus \123 is likely to be octal 123
1641 (cf \0123, which is octal 012 followed by the literal 3).
1643 Inside a character class, \ followed by a digit is always either a literal
1644 8 or 9 or an octal number. */
1646 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1647 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1652 ptr--; /* Back to the digit */
1653 if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
1657 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1658 are octal escapes if there are not that many previous captures. */
1660 if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
1662 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1663 else escape = -s; /* Indicates a back reference */
1666 ptr = oldptr; /* Put the pointer back and fall through */
1669 /* Handle a digit following \ when the number is not a back reference, or
1670 we are within a character class. If the first digit is 8 or 9, Perl used to
1671 generate a binary zero and then treat the digit as a following literal. At
1672 least by Perl 5.18 this changed so as not to insert the binary zero. */
1674 if (c >= CHAR_8) break;
1678 /* \0 always starts an octal number, but we may drop through to here with a
1679 larger first octal digit. The original code used just to take the least
1680 significant 8 bits of octal numbers (I think this is what early Perls used
1681 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1682 but no more than 3 octal digits. */
1686 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1687 c = c * 8 + *ptr++ - CHAR_0;
1688 #if PCRE2_CODE_UNIT_WIDTH == 8
1689 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1693 /* \o is a relatively new Perl feature, supporting a more general way of
1694 specifying character codes in octal. The only supported form is \o{ddd}. */
1697 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1700 *errorcodeptr = ERR55;
1702 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1703 *errorcodeptr = ERR78;
1708 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1711 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1712 #if PCRE2_CODE_UNIT_WIDTH == 32
/* A value of 0x20000000 or more would not fit in 32 bits after the shift by
three below, so it is treated as an overflow before shifting. */
1713 if (c >= 0x20000000l) { overflow = TRUE; break; }
1715 c = (c << 3) + (cc - CHAR_0);
1716 #if PCRE2_CODE_UNIT_WIDTH == 8
1717 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1718 #elif PCRE2_CODE_UNIT_WIDTH == 16
1719 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1720 #elif PCRE2_CODE_UNIT_WIDTH == 32
1721 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1726 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1727 *errorcodeptr = ERR34;
1729 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1731 if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
1732 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
1735 *errorcodeptr = ERR73;
1741 *errorcodeptr = ERR64;
1746 /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
1747 two hexadecimal digits. Otherwise it is a lowercase x letter. */
1750 if ((options & PCRE2_ALT_BSUX) != 0)
1753 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1754 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1755 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1758 } /* End PCRE2_ALT_BSUX handling */
1760 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1761 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1762 digits. If not, { used to be treated as a data character. However, Perl
1763 seems to read hex digits up to the first non-such, and ignore the rest, so
1764 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1765 now gives an error. */
1769 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1774 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1776 *errorcodeptr = ERR78;
1782 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1785 if (c == 0 && cc == 0) continue; /* Leading zeroes */
1786 #if PCRE2_CODE_UNIT_WIDTH == 32
1787 if (c >= 0x10000000l) { overflow = TRUE; break; }
1790 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1799 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1800 *errorcodeptr = ERR34;
1802 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1804 if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
1805 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
1808 *errorcodeptr = ERR73;
1812 /* If the sequence of hex digits does not end with '}', give an error.
1813 We used just to recognize this construct and fall through to the normal
1814 \x handling, but nowadays Perl gives an error, which seems much more
1815 sensible, so we do too. */
1820 *errorcodeptr = ERR67;
1822 } /* End of \x{} processing */
1824 /* Read up to two hex digits after \x */
1829 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1832 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1835 } /* End of \xdd handling */
1836 } /* End of Perl-style \x handling */
1839 /* The handling of \c is different in ASCII and EBCDIC environments. In an
1840 ASCII (or Unicode) environment, an error is given if the character
1841 following \c is not a printable ASCII character. Otherwise, the following
1842 character is upper-cased if it is a letter, and after that the 0x40 bit is
1843 flipped. The result is the value of the escape.
1845 In an EBCDIC environment the handling of \c is compatible with the
1846 specification in the perlebcdic document. The following character must be
1847 a letter or one of a small number of special characters. These provide a
1848 means of defining the character values 0-31.
1850 For testing the EBCDIC handling of \c in an ASCII environment, recognize
1851 the EBCDIC value of 'c' explicitly. */
1853 #if defined EBCDIC && 'a' != 0x81
1860 *errorcodeptr = ERR2;
1864 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
1866 /* Handle \c in an ASCII/Unicode environment. */
1868 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1869 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
1871 *errorcodeptr = ERR68;
1876 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
1877 255 (0xff) or 95 (0x5f) if other characters suggest we are using the POSIX-BC
1878 encoding. (This is the way Perl indicates that it handles \c?.) The other
1879 valid sequences correspond to a list of specific characters. */
1882 if (c == CHAR_QUESTION_MARK)
1883 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1886 for (i = 0; i < 32; i++)
1888 if (c == ebcdic_escape_c[i]) break;
1890 if (i < 32) c = i; else *errorcodeptr = ERR68;
1897 /* Any other alphanumeric following \ is an error. Perl gives an error only
1898 if in warning mode, but PCRE doesn't have a warning mode. */
1901 *errorcodeptr = ERR3;
1902 *ptrptr = ptr - 1; /* Point to the character at fault */
1907 /* Set the pointer to the next character before returning. */
1916 #ifdef SUPPORT_UNICODE
1917 /*************************************************
1918 * Handle \P and \p *
1919 *************************************************/
1921 /* This function is called after \P or \p has been encountered, provided that
1922 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
1923 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
1924 after the final code unit of the escape sequence.
1927 ptrptr the pattern position pointer
1928 negptr a boolean that is set TRUE for negation else FALSE
1929 ptypeptr an unsigned int that is set to the type value
1930 pdataptr an unsigned int that is set to the detailed property value
1931 errorcodeptr the error code variable
1934 Returns: TRUE if the type value was found, or FALSE for an invalid type
/* Read the property name that follows \p or \P and look it up in the table of
property names. On a match the type and value from the table entry are stored
through ptypeptr and pdataptr. An unrecognized name sets ERR47, and a malformed
sequence sets ERR46 via ERROR_RETURN. The arguments are described in the
comment that precedes this function. */
1938 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
1939 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
1942 PCRE2_SIZE i, bot, top;
1943 PCRE2_SPTR ptr = *ptrptr;
/* name[] receives the property name; the loop that fills it stops one element
short of its 32-element capacity. */
1944 PCRE2_UCHAR name[32];
/* Reaching the end of the pattern straight after \p or \P is malformed. */
1946 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
1950 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1953 if (c == CHAR_LEFT_CURLY_BRACKET)
1955 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
1956 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
1961 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
1963 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
1965 if (c == CHAR_NUL) goto ERROR_RETURN;
1966 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1969 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1973 /* Otherwise there is just one following character, which must be an ASCII
1976 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
1981 else goto ERROR_RETURN;
1985 /* Search for a recognized property name using binary chop. */
/* The upper bound starts at PRIV(utt_size), presumably the number of entries
in the PRIV(utt) table - confirm against its definition. */
1988 top = PRIV(utt_size);
/* Probe the midpoint of the current [bot, top) range and compare the collected
name with that entry's name inside PRIV(utt_names). */
1993 i = (bot + top) >> 1;
1994 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
/* The matching entry supplies the property type and value. */
1997 *ptypeptr = PRIV(utt)[i].type;
1998 *pdataptr = PRIV(utt)[i].value;
/* A positive comparison result moves the lower bound above i; otherwise i
becomes the new upper bound. */
2001 if (r > 0) bot = i + 1; else top = i;
2003 *errorcodeptr = ERR47; /* Unrecognized name */
2006 ERROR_RETURN: /* Malformed \P or \p */
2007 *errorcodeptr = ERR46;
2015 /*************************************************
2016 * Check for POSIX class syntax *
2017 *************************************************/
2019 /* This function is called when the sequence "[:" or "[." or "[=" is
2020 encountered in a character class. It checks whether this is followed by a
2021 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2022 reach an unescaped ']' without the special preceding character, return FALSE.
2024 Originally, this function only recognized a sequence of letters between the
2025 terminators, but it seems that Perl recognizes any sequence of characters,
2026 though of course unknown POSIX names are subsequently rejected. Perl gives an
2027 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2028 didn't consider this to be a POSIX class. Likewise for [:1234:].
2030 The problem in trying to be exactly like Perl is in the handling of escapes. We
2031 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2032 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2033 below handles the special cases \\ and \], but does not try to do any other
2034 escape processing. This makes it different from Perl for cases such as
2035 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2036 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2037 when Perl does, I think.
2039 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2040 It seems that the appearance of a nested POSIX class supersedes an apparent
2041 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2042 a digit. This is handled by returning FALSE if the start of a new group with
2043 the same terminator is encountered, since the next closing sequence must close
2044 the nested group, not the outer one.
2046 In Perl, unescaped square brackets may also appear as part of class names. For
2047 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2048 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2049 seem right at all. PCRE does not allow closing square brackets in POSIX class
2053 ptr pointer to the character after the initial [ (colon, dot, equals)
2054 ptrend pointer to the end of the pattern
2055 endptr where to return a pointer to the terminating ':', '.', or '='
2057 Returns: TRUE or FALSE
2061 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
/* On entry ptr addresses the code unit that follows '['. That code unit is
remembered as the terminator and ptr is advanced past it. */
2063 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2064 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
/* Scan only while at least two code units remain, so that every ptr[1] read
below stays inside the pattern. */
2066 for (; ptrend - ptr >= 2; ptr++)
/* A backslash followed by ']' or by another backslash is recognized as a pair.
NOTE(review): the statement executed for this case is not visible in this
excerpt; presumably it steps over the escaped code unit - confirm. */
2068 if (*ptr == CHAR_BACKSLASH &&
2069 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
/* A '[' followed by the same terminator, or a ']' reached without the
terminator in front of it, means this is not a POSIX class: give up. */
2072 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2073 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
/* The terminator immediately followed by ']' is the closing sequence.
NOTE(review): the statements that act on this match are not visible in this
excerpt; presumably they store ptr through endptr and return TRUE - confirm. */
2075 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2087 /*************************************************
2088 * Check POSIX class name *
2089 *************************************************/
2091 /* This function is called to check the name given in a POSIX-style class entry
2095 ptr points to the first letter
2096 len the length of the name
2098 Returns: a value representing the name, or -1 if unknown
2102 check_posix_name(PCRE2_SPTR ptr, int len)
/* pn walks through posix_names in step with the posix_name_lengths table.
NOTE(review): the declaration and initial value of yield, its increment, and
the final return are on lines not visible in this excerpt. */
2104 const char *pn = posix_names;
/* A zero entry in posix_name_lengths marks the end of the table. */
2106 while (posix_name_lengths[yield] != 0)
/* Compare the lengths first so that the string comparison runs only for a
candidate of exactly the right size; on a match the table index is returned. */
2108 if (len == posix_name_lengths[yield] &&
2109 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
/* Each name occupies its length plus one further character in posix_names
(presumably a separating NUL - confirm against the table definition). */
2110 pn += posix_name_lengths[yield] + 1;
2118 /*************************************************
2119 * Read a subpattern or VERB name *
2120 *************************************************/
2122 /* This function is called from parse_regex() below whenever it needs to read
2123 the name of a subpattern or a (*VERB). The initial pointer must be to the
2124 character before the name. If that character is '*' we are reading a verb name.
2125 The pointer is updated to point after the name, for a VERB, or after the name's
2126 terminator for a subpattern name. Returning both the offset and the name
2127 pointer is redundant information, but some callers use one and some the other,
2128 so it is simplest just to return both.
2131 ptrptr points to the character pointer variable
2132 ptrend points to the end of the input string
2133 terminator the terminator of a subpattern name must be this
2134 offsetptr where to put the offset from the start of the pattern
2135 nameptr where to put a pointer to the name in the input
2136 namelenptr where to put the length of the name
2137 errcodeptr where to put an error code
2138 cb pointer to the compile data block
2140 Returns: TRUE if a name was read
2141 FALSE otherwise, with error code set
2145 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t terminator,
2146 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2147 int *errorcodeptr, compile_block *cb)
/* *ptrptr addresses the character before the name. If that character is '*' a
verb name is being read and only characters with the ctype_letter bit are
accepted; otherwise characters with the ctype_word bit are accepted. */
2149 PCRE2_SPTR ptr = *ptrptr;
2150 BOOL is_verb = (*ptr == CHAR_ASTERISK);
2151 uint32_t namelen = 0;
2152 uint32_t ctype = is_verb? ctype_letter : ctype_word;
/* Step past the introducing character; if that reaches the end of the pattern
there is no name to read. */
2154 if (++ptr >= ptrend)
2156 *errorcodeptr = is_verb? ERR60: /* Verb not recognized or malformed */
2157 ERR62; /* Subpattern name expected */
/* The offset handed back is that of the first name character, measured in
code units from the start of the pattern. */
2162 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
/* NOTE(review): the test that guards the next error is not visible in this
excerpt; going by the message it rejects a name whose first character is a
digit - confirm. */
2166 *errorcodeptr = ERR44; /* Group name must not start with digit */
/* Consume name characters. MAX_255() is tested before cb->ctypes is indexed;
NOTE(review): presumably because that table covers only code units up to 255 -
confirm. The loop body is not visible in this excerpt. */
2170 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype) != 0)
/* A name longer than MAX_NAME_SIZE is reported as ERR48. */
2174 if (namelen > MAX_NAME_SIZE)
2176 *errorcodeptr = ERR48;
2181 /* Subpattern names must not be empty, and their terminator is checked here.
2182 (What follows a verb name is checked separately.) */
/* NOTE(review): the emptiness test that guards this error is on a line not
visible in this excerpt. */
2188 *errorcodeptr = ERR62; /* Subpattern name expected */
/* ERR42 is given when the pattern ends before the terminator or when the next
code unit is not the expected terminator. */
2191 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2193 *errorcodeptr = ERR42;
/* Hand the name length back to the caller. */
2199 *namelenptr = namelen;
2210 /*************************************************
2211 * Manage callouts at start of cycle *
2212 *************************************************/
2214 /* At the start of a new item in parse_regex() we are able to record the
2215 details of the previous item in a prior callout, and also to set up an
2216 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2217 which would otherwise happen for items such as \Q that contribute nothing to
2221 ptr current pattern pointer
2222 pcalloutptr points to a pointer to previous callout, or NULL
2223 auto_callout TRUE if auto_callouts are enabled
2224 parsed_pattern the parsed pattern pointer
2227 Returns: possibly updated parsed_pattern pointer.
2231 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2232 uint32_t *parsed_pattern, compile_block *cb)
/* A callout occupies four elements of the parsed pattern. As written by this
function: [0] is META_CALLOUT_NUMBER, [1] is the pattern offset of the item
that follows the callout, [2] is the length of that item (filled in on the
next call), and [3] is 255 for an automatic callout. */
2234 uint32_t *previous_callout = *pcalloutptr;
/* Complete the previous callout: the length of the item it precedes is the
distance from the offset saved in [1] to the current pattern position. */
2236 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2237 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
/* With automatic callouts disabled there is nothing further to track.
Otherwise a new item is written unless the previous callout is itself
automatic ([3] == 255) and is the last thing in the parsed pattern; reusing
it in that case is what avoids two adjacent automatic callouts. */
2239 if (!auto_callout) previous_callout = NULL; else
2241 if (previous_callout == NULL ||
2242 previous_callout != parsed_pattern - 4 ||
2243 previous_callout[3] != 255)
2245 previous_callout = parsed_pattern; /* Set up new automatic callout */
2246 parsed_pattern += 4;
2247 previous_callout[0] = META_CALLOUT_NUMBER;
2248 previous_callout[2] = 0;
2249 previous_callout[3] = 255;
/* NOTE(review): the brace lines are missing from this excerpt; presumably the
offset update below applies to both a fresh and a reused automatic callout -
confirm. */
2251 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
/* Tell the caller which callout (if any) is now pending, and where the next
parsed item should be written. */
2254 *pcalloutptr = previous_callout;
2255 return parsed_pattern;
2260 /*************************************************
2261 * Parse regex and identify named groups *
2262 *************************************************/
2264 /* This function is called first of all. It scans the pattern and does two
2265 things: (1) It identifies capturing groups and makes a table of named capturing
2266 groups so that information about them is fully available to both the compiling
2267 scans. (2) It writes a parsed version of the pattern with comments omitted and
2268 escapes processed into the parsed_pattern vector.
2271 ptr points to the start of the pattern
2272 options compiling dynamic options (may change during the scan)
2273 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2274 cb pointer to the compile data block
2276 Returns: zero on success or a non-zero error code, with the
2277 error offset placed in the cb field
2280 /* A structure and some flags for dealing with nested groups. */
/* Record used while parsing nested groups. parse_regex() keeps pointers to
such records (top_nest, end_nests) inside the compile workspace.
NOTE(review): only two members are visible in this excerpt; any further
members and the end of the typedef are on lines not shown here. */
2282 typedef struct nest_save {
2283 uint16_t nest_depth; /* Same type as the nest_depth counter in parse_regex() */
2284 uint16_t reset_group;
/* Single-bit flag values, so they can be combined in one word.
NOTE(review): presumably held in a flags member of nest_save that is not
visible in this excerpt - confirm. */
2290 #define NSF_RESET 0x0001u
2291 #define NSF_CONDASSERT 0x0002u
2293 /* Options that are changeable within the pattern must be tracked during
2294 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2295 but all must be tracked so that META_OPTIONS items set the correct values for
2296 the main compiling phase. */
2298 #define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
2299 PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
2302 /* States used for analyzing ranges in character classes. The two OK values
2305 enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
2307 /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2308 the storing of literal values in the parsed pattern. */
2310 #if PCRE2_CODE_UNIT_WIDTH == 32
2311 #define PARSED_LITERAL(c, p) \
2313 if (c >= META_END) *p++ = META_BIGVALUE; \
2315 okquantifier = TRUE; \
2318 #define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
2321 /* Here's the actual function. */
2323 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2329 uint32_t class_range_state;
2330 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2331 uint32_t *previous_callout = NULL;
2332 uint32_t *parsed_pattern = cb->parsed_pattern;
2333 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2334 uint32_t meta_quantifier = 0;
2335 uint32_t add_after_mark = 0;
2336 uint16_t nest_depth = 0;
2337 int after_manual_callout = 0;
2338 int expect_cond_assert = 0;
2342 BOOL inescq = FALSE;
2343 BOOL inverbname = FALSE;
2344 BOOL utf = (options & PCRE2_UTF) != 0;
2345 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2348 BOOL okquantifier = FALSE;
2351 PCRE2_SPTR ptrend = cb->end_pattern;
2352 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2354 nest_save *top_nest, *end_nests;
2356 /* Insert leading items for word and line matching (features provided for the
2357 benefit of pcre2grep). */
2359 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2361 *parsed_pattern++ = META_CIRCUMFLEX;
2362 *parsed_pattern++ = META_NOCAPTURE;
2364 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2366 *parsed_pattern++ = META_ESCAPE + ESC_b;
2367 *parsed_pattern++ = META_NOCAPTURE;
2370 /* If the pattern is actually a literal string, process it separately to avoid
2371 cluttering up the main loop. */
2373 if ((options & PCRE2_LITERAL) != 0)
2375 while (ptr < ptrend)
2377 if (parsed_pattern >= parsed_pattern_end)
2379 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2383 GETCHARINCTEST(c, ptr);
2385 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2386 auto_callout, parsed_pattern, cb);
2387 PARSED_LITERAL(c, parsed_pattern);
2392 /* Process a real regex which may contain meta-characters. */
2395 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2397 /* The size of the nest_save structure might not be a factor of the size of the
2398 workspace. Therefore we must round down end_nests so as to correctly avoid
2399 creating a nest_save that spans the end of the workspace. */
2401 end_nests = (nest_save *)((char *)end_nests -
2402 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2404 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2406 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2408 /* Now scan the pattern */
2410 while (ptr < ptrend)
2412 int prev_expect_cond_assert;
2413 uint32_t min_repeat, max_repeat;
2414 uint32_t set, unset, *optset;
2415 uint32_t terminator;
2416 uint32_t prev_meta_quantifier;
2417 BOOL prev_okquantifier;
2421 if (parsed_pattern >= parsed_pattern_end)
2423 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2427 if (nest_depth > cb->cx->parens_nest_limit)
2430 goto FAILED; /* Parentheses too deeply nested */
2433 /* Get next input character, save its position for callout handling. */
2436 GETCHARINCTEST(c, ptr);
2438 /* Copy quoted literals until \E, allowing for the possibility of automatic
2439 callouts, except when processing a (*VERB) "name". */
2443 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2450 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2451 { /* expecting a conditional assertion, */
2452 ptr--; /* but an empty \Q\E sequence is OK. */
2456 if (!inverbname && after_manual_callout-- <= 0)
2457 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2458 auto_callout, parsed_pattern, cb);
2459 PARSED_LITERAL(c, parsed_pattern);
2460 meta_quantifier = 0;
2462 continue; /* Next character */
2465 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2466 characters up to the closing parenthesis are literals except when
2467 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2468 and \E and escaped characters are allowed (no character types such as \d). If
2469 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2470 this by not entering the special (*VERB:NAME) processing - they are then
2471 picked up below. Note that c is a character, not a code unit, so we must not
2472 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2473 TRUE in 8-bit mode. */
2477 /* EITHER: not both options set */
2478 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2479 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2480 #ifdef SUPPORT_UNICODE
2481 /* OR: character > 255 AND not Unicode Pattern White Space */
2482 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2484 /* OR: not a # comment or isspace() white space */
2485 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2486 #ifdef SUPPORT_UNICODE
2487 /* and not CHAR_NEL when Unicode is supported */
2492 PCRE2_SIZE verbnamelength;
2497 PARSED_LITERAL(c, parsed_pattern);
2500 case CHAR_RIGHT_PARENTHESIS:
2502 okquantifier = FALSE; /* Was probably set by literals */
2503 /* This is the length in characters */
2504 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2505 /* But the limit on the length is in code units */
2506 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2512 *verblengthptr = (uint32_t)verbnamelength;
2514 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2515 a (*MARK) was generated for the name. We now add the original verb as the
2518 if (add_after_mark != 0)
2520 *parsed_pattern++ = add_after_mark;
2525 case CHAR_BACKSLASH:
2526 if ((options & PCRE2_ALT_VERBNAMES) != 0)
2528 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2530 if (errorcode != 0) goto FAILED;
2532 else escape = 0; /* Treat all as literal */
2537 PARSED_LITERAL(c, parsed_pattern);
2544 case ESC_E: /* Ignore */
2548 errorcode = ERR40; /* Invalid in verb name */
2552 continue; /* Next character in pattern */
2555 /* Not a verb name character. At this point we must process everything that
2556 must not change the quantification state. This is mainly comments, but we
2557 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2558 A+, as in Perl. An isolated \E is ignored. */
2560 if (c == CHAR_BACKSLASH && ptr < ptrend)
2562 if (*ptr == CHAR_Q || *ptr == CHAR_E)
2564 inescq = *ptr == CHAR_Q;
2570 /* Skip over whitespace and # comments in extended mode. Note that c is a
2571 character, not a code unit, so we must not use MAX_255 to test its size
2572 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2573 whitespace characters are those designated as "Pattern White Space" by
2574 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2575 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2576 subset of space characters that match \h and \v. */
2578 if ((options & PCRE2_EXTENDED) != 0)
2580 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2581 #ifdef SUPPORT_UNICODE
2582 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2584 if (c == CHAR_NUMBER_SIGN)
2586 while (ptr < ptrend)
2588 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
2589 { /* IS_NEWLINE sets cb->nllen. */
2594 #ifdef SUPPORT_UNICODE
2595 if (utf) FORWARDCHARTEST(ptr, ptrend);
2598 continue; /* Next character in pattern */
2602 /* Skip over bracketed comments */
2604 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2605 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2607 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2610 errorcode = ERR18; /* A special error for missing ) in a comment */
2611 goto FAILED; /* to make it easier to debug. */
2614 continue; /* Next character in pattern */
2617 /* If the next item is not a quantifier, fill in length of any previous
2618 callout and create an auto callout if required. */
2620 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2621 (c != CHAR_LEFT_CURLY_BRACKET ||
2623 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2625 if (after_manual_callout-- <= 0)
2626 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2627 parsed_pattern, cb);
2630 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2631 assertion, possibly preceded by a callout. If the value is 1, we have just
2632 had the callout and expect an assertion. There must be at least 3 more
2633 characters in all cases. When expect_cond_assert is 2, we know that the
2634 current character is an opening parenthesis, as otherwise we wouldn't be
2635 here. However, when it is 1, we need to check, and it's easiest just to check
2636 always. Note that expect_cond_assert may be negative, since all callouts just
2639 if (expect_cond_assert > 0)
2641 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2642 ptr[0] == CHAR_QUESTION_MARK;
2643 if (ok) switch(ptr[1])
2646 ok = expect_cond_assert == 2;
2649 case CHAR_EQUALS_SIGN:
2650 case CHAR_EXCLAMATION_MARK:
2653 case CHAR_LESS_THAN_SIGN:
2654 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2663 ptr--; /* Adjust error offset */
2669 /* Remember whether we are expecting a conditional assertion, and set the
2670 default for this item. */
2672 prev_expect_cond_assert = expect_cond_assert;
2673 expect_cond_assert = 0;
2675 /* Remember quantification status for the previous significant item, then set
2676 default for this item. */
2678 prev_okquantifier = okquantifier;
2679 prev_meta_quantifier = meta_quantifier;
2680 okquantifier = FALSE;
2681 meta_quantifier = 0;
2683 /* If the previous significant item was a quantifier, adjust the parsed code
2684 if there is a following modifier. The base meta value is always followed by
2685 the PLUS and QUERY values, in that order. We do this here rather than after
2686 reading a quantifier so that intervening comments and /x whitespace can be
2687 ignored without having to replicate code. */
2689 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2691 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2692 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2693 0x00020000u : 0x00010000u);
2694 continue; /* Next character in pattern */
2698 /* Process the next item in the main part of a pattern. */
2702 default: /* Non-special character */
2703 PARSED_LITERAL(c, parsed_pattern);
2707 /* ---- Escape sequence ---- */
2709 case CHAR_BACKSLASH:
2711 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2716 if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2719 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2721 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
2723 escape = 0; /* Treat as literal character */
2726 /* The escape was a data escape or literal character. */
2730 PARSED_LITERAL(c, parsed_pattern);
2733 /* The escape was a back (or forward) reference. We keep the offset in
2734 order to give a more useful diagnostic for a bad forward reference. For
2735 references to groups numbered less than 10 we can't use more than two items
2736 in parsed_pattern because they may be just two characters in the input (and
2737 in a 64-bit world an offset may need two elements). So for them, the offset
2738 of the first occurrence is held in a special vector. */
2740 else if (escape < 0)
2742 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2744 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2747 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2748 cb->small_ref_offset[escape] = offset;
2752 PUTOFFSET(offset, parsed_pattern);
2754 okquantifier = TRUE;
2757 /* The escape was a character class such as \d etc. or other special
2758 escape indicator such as \A or \X. Most of them generate just a single
2759 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2760 value. They are supported only when Unicode is available. The type and
2761 value are packed into a single 32-bit value so that the whole sequence
2762 uses only two elements in the parsed_vector. This is because the same
2763 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2766 There are also some cases where the escape sequence is followed by a name:
2767 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2768 and \g'name' are subroutine calls by name; \g{name} is a synonym for
2769 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2770 and returned as a negative value (handled above). A name is coded as an
2771 offset into the pattern and a length. */
2773 else switch (escape)
2776 #ifdef NEVER_BACKSLASH_C
2780 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2786 okquantifier = TRUE;
2787 *parsed_pattern++ = META_ESCAPE + escape;
2791 #ifndef SUPPORT_UNICODE
2792 errorcode = ERR45; /* Supported only with Unicode support */
2801 okquantifier = TRUE;
2802 *parsed_pattern++ = META_ESCAPE + escape;
2805 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
2806 *parsed_pattern++ = META_ESCAPE + escape;
2809 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
2810 without Unicode support because it is checked when pcre2_compile() is
2819 okquantifier = TRUE;
2820 if ((options & PCRE2_UCP) == 0)
2822 *parsed_pattern++ = META_ESCAPE + escape;
2826 *parsed_pattern++ = META_ESCAPE +
2827 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
2833 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2838 *parsed_pattern++ = PT_SPACE << 16;
2843 *parsed_pattern++ = PT_WORD << 16;
2849 /* Unicode property matching */
2853 #ifdef SUPPORT_UNICODE
2856 uint16_t ptype = 0, pdata = 0;
2857 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
2859 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
2860 *parsed_pattern++ = META_ESCAPE + escape;
2861 *parsed_pattern++ = (ptype << 16) | pdata;
2862 okquantifier = TRUE;
2868 break; /* End \P and \p */
2870 /* When \g is used with quotes or angle brackets as delimiters, it is a
2871 numerical or named subroutine call, and control comes here. When used
2872 with brace delimiters it is a numerical back reference and does not come
2873 here because check_escape() returns it directly as a reference. \k is
2874 always a named back reference. */
2878 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
2879 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
2881 errorcode = (escape == ESC_g)? ERR57 : ERR69;
2884 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
2885 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
2886 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
2888 /* For a non-braced \g, check for a numerical recursion. */
2890 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
2892 PCRE2_SPTR p = ptr + 1;
2894 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
2897 if (p >= ptrend || *p != terminator)
2905 if (errorcode != 0) goto ESCAPE_FAILED;
2908 /* Not a numerical recursion */
2910 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
2911 &errorcode, cb)) goto ESCAPE_FAILED;
2913 /* \k and \g when used with braces are back references, whereas \g used
2914 with quotes or angle brackets is a recursion */
2917 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
2918 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
2919 *parsed_pattern++ = namelen;
2921 PUTOFFSET(offset, parsed_pattern);
2922 okquantifier = TRUE;
2923 break; /* End special escape processing */
2925 break; /* End escape sequence processing */
2928 /* ---- Single-character special items ---- */
2930 case CHAR_CIRCUMFLEX_ACCENT:
2931 *parsed_pattern++ = META_CIRCUMFLEX;
2934 case CHAR_DOLLAR_SIGN:
2935 *parsed_pattern++ = META_DOLLAR;
2939 *parsed_pattern++ = META_DOT;
2940 okquantifier = TRUE;
2944 /* ---- Single-character quantifiers ---- */
2947 meta_quantifier = META_ASTERISK;
2948 goto CHECK_QUANTIFIER;
2951 meta_quantifier = META_PLUS;
2952 goto CHECK_QUANTIFIER;
2954 case CHAR_QUESTION_MARK:
2955 meta_quantifier = META_QUERY;
2956 goto CHECK_QUANTIFIER;
2959 /* ---- Potential {n,m} quantifier ---- */
2961 case CHAR_LEFT_CURLY_BRACKET:
2962 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
2965 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
2966 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
2967 break; /* No more quantifier processing */
2969 meta_quantifier = META_MINMAX;
2973 /* ---- Quantifier post-processing ---- */
2975 /* Check that a quantifier is allowed after the previous item. */
2978 if (!prev_okquantifier)
2984 /* Now we can put the quantifier into the parsed pattern vector. At this
2985 stage, we have only the basic quantifier. The check for a following + or ?
2986 modifier happens at the top of the loop, after any intervening comments
2987 have been removed. */
2989 *parsed_pattern++ = meta_quantifier;
2990 if (c == CHAR_LEFT_CURLY_BRACKET)
2992 *parsed_pattern++ = min_repeat;
2993 *parsed_pattern++ = max_repeat;
2998 /* ---- Character class ---- */
3000 case CHAR_LEFT_SQUARE_BRACKET:
3001 okquantifier = TRUE;
3003 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3004 used for "start of word" and "end of word". As these are otherwise illegal
3005 sequences, we don't break anything by recognizing them. They are replaced
3006 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3007 erroneous and are handled by the normal code below. */
3009 if (ptrend - ptr >= 6 &&
3010 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3011 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3013 *parsed_pattern++ = META_ESCAPE + ESC_b;
3015 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3017 *parsed_pattern++ = META_LOOKAHEAD;
3021 *parsed_pattern++ = META_LOOKBEHIND;
3022 *has_lookbehind = TRUE;
3024 /* The offset is used only for the "non-fixed length" error; this won't
3025 occur here, so just store zero. */
3027 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3030 if ((options & PCRE2_UCP) == 0)
3031 *parsed_pattern++ = META_ESCAPE + ESC_w;
3034 *parsed_pattern++ = META_ESCAPE + ESC_p;
3035 *parsed_pattern++ = PT_WORD << 16;
3037 *parsed_pattern++ = META_KET;
3042 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3043 they are encountered at the top level, so we'll do that too. */
3045 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3046 *ptr == CHAR_EQUALS_SIGN) &&
3047 check_posix_syntax(ptr, ptrend, &tempptr))
3049 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3053 /* Process a regular character class. If the first character is '^', set
3054 the negation flag. If the first few characters (either before or after ^)
3055 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3056 This makes for compatibility with Perl. */
3058 negate_class = FALSE;
3059 while (ptr < ptrend)
3061 GETCHARINCTEST(c, ptr);
3062 if (c == CHAR_BACKSLASH)
3064 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3065 else if (ptrend - ptr >= 3 &&
3066 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3071 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3072 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3074 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3075 negate_class = TRUE;
3079 /* Now the real contents of the class; c has the first "real" character.
3080 Empty classes are permitted only if the option is set. */
3082 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3083 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3085 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3086 break; /* End of class processing */
3089 /* Process a non-empty class. */
3091 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3092 class_range_state = RANGE_NO;
3094 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3095 because there are holes in the encoding, and simply using the range A-Z
3096 (for example) would include the characters in the holes. This applies only
3097 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3098 in this respect. In order to accommodate this, we keep track of whether
3099 character values are literal or not, and a state variable for handling
3102 /* Loop for the contents of the class */
3106 BOOL char_is_literal = TRUE;
3108 /* Inside \Q...\E everything is literal except \E */
3112 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3114 inescq = FALSE; /* Reset literal state */
3115 ptr++; /* Skip the 'E' */
3116 goto CLASS_CONTINUE;
3121 /* Skip over space and tab (only) in extended-more mode. */
3123 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3124 (c == CHAR_SPACE || c == CHAR_HT))
3125 goto CLASS_CONTINUE;
3127 /* Handle POSIX class names. Perl allows a negation extension of the
3128 form [:^name:]. A square bracket that doesn't match the syntax is
3129 treated as a literal. We also recognize the POSIX constructions
3130 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3133 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3134 ptrend - ptr >= 3 &&
3135 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3136 *ptr == CHAR_EQUALS_SIGN) &&
3137 check_posix_syntax(ptr, ptrend, &tempptr))
3139 BOOL posix_negate = FALSE;
3142 /* Perl treats a hyphen before a POSIX class as a literal, not the
3143 start of a range. However, it gives a warning in its warning mode. PCRE
3144 does not have a warning mode, so we give an error, because this is
3145 likely an error on the user's part. */
3147 if (class_range_state == RANGE_STARTED)
3153 if (*ptr != CHAR_COLON)
3159 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3161 posix_negate = TRUE;
3165 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3166 if (posix_class < 0)
3173 /* Perl treats a hyphen after a POSIX class as a literal, not the
3174 start of a range. However, it gives a warning in its warning mode
3175 unless the hyphen is the last character in the class. PCRE does not
3176 have a warning mode, so we give an error, because this is likely an
3177 error on the user's part. */
3179 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3180 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3186 /* Set "a hyphen is not the start of a range" for the -] case, and also
3187 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3188 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3189 hyphen to be treated as a literal. I don't think it's worth setting up
3190 special apparatus to do otherwise. */
3192 class_range_state = RANGE_NO;
3194 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3195 use Unicode properties \p or \P or, in one case, \h or \H. The
3196 substitutes table has two values per class, containing the type and
3197 value of a \p or \P item. The special cases are specified with a
3198 negative type: a non-zero value causes \h or \H to be used, and a zero
3199 value falls through to behave like a non-UCP POSIX class. */
3201 #ifdef SUPPORT_UNICODE
3202 if ((options & PCRE2_UCP) != 0)
3204 int ptype = posix_substitutes[2*posix_class];
3205 int pvalue = posix_substitutes[2*posix_class + 1];
3208 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3209 *parsed_pattern++ = (ptype << 16) | pvalue;
3210 goto CLASS_CONTINUE;
3215 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3216 goto CLASS_CONTINUE;
3221 #endif /* SUPPORT_UNICODE */
3223 /* Non-UCP POSIX class */
3225 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3226 *parsed_pattern++ = posix_class;
3229 /* Handle potential start of range */
3231 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3233 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3234 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3235 class_range_state = RANGE_STARTED;
3238 /* Handle a literal character */
3240 else if (c != CHAR_BACKSLASH)
3243 if (class_range_state == RANGE_STARTED)
3245 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3247 else if (parsed_pattern[-2] > c) /* Check range is in order */
3254 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3255 parsed_pattern[-1] = META_RANGE_ESCAPED;
3256 PARSED_LITERAL(c, parsed_pattern);
3258 class_range_state = RANGE_NO;
3260 else /* Potential start of range */
3262 class_range_state = char_is_literal?
3263 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3264 PARSED_LITERAL(c, parsed_pattern);
3268 /* Handle escapes in a class */
3273 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode,
3277 CLASS_ESCAPE_FAILED:
3278 if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3281 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3283 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3285 escape = 0; /* Treat as literal character */
3288 if (escape == 0) /* Escaped character code point is in c */
3290 char_is_literal = FALSE;
3294 /* These three escapes do not alter the class range state. */
3296 if (escape == ESC_b)
3298 c = CHAR_BS; /* \b is backspace in a class */
3299 char_is_literal = FALSE;
3303 else if (escape == ESC_Q)
3305 inescq = TRUE; /* Enter literal mode */
3306 goto CLASS_CONTINUE;
3309 else if (escape == ESC_E) /* Ignore orphan \E */
3310 goto CLASS_CONTINUE;
3312 /* The second part of a range can be a single-character escape
3313 sequence (detected above), but not any of the other escapes. Perl
3314 treats a hyphen as a literal in such circumstances. However, in Perl's
3315 warning mode, a warning is given, so PCRE now faults it, as it is
3316 almost certainly a mistake on the user's part. */
3318 if (class_range_state == RANGE_STARTED)
3321 goto CLASS_ESCAPE_FAILED;
3324 /* Of the remaining escapes, only those that define characters are
3325 allowed in a class. None may start a range. */
3327 class_range_state = RANGE_NO;
3331 errorcode = ERR71; /* Not supported in a class */
3332 goto CLASS_ESCAPE_FAILED;
3338 *parsed_pattern++ = META_ESCAPE + escape;
3341 /* These escapes are converted to Unicode property tests when
3342 PCRE2_UCP is set. */
3350 if ((options & PCRE2_UCP) == 0)
3352 *parsed_pattern++ = META_ESCAPE + escape;
3356 *parsed_pattern++ = META_ESCAPE +
3357 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3363 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3368 *parsed_pattern++ = PT_SPACE << 16;
3373 *parsed_pattern++ = PT_WORD << 16;
3379 /* Explicit Unicode property matching */
3383 #ifdef SUPPORT_UNICODE
3386 uint16_t ptype = 0, pdata = 0;
3387 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3389 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3390 *parsed_pattern++ = META_ESCAPE + escape;
3391 *parsed_pattern++ = (ptype << 16) | pdata;
3395 goto CLASS_ESCAPE_FAILED;
3397 break; /* End \P and \p */
3399 default: /* All others are not allowed in a class */
3402 goto CLASS_ESCAPE_FAILED;
3405 /* Perl gives a warning unless a following hyphen is the last character
3406 in the class. PCRE throws an error. */
3408 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3409 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3416 /* Proceed to next thing in the class. */
3421 errorcode = ERR6; /* Missing terminating ']' */
3424 GETCHARINCTEST(c, ptr);
3425 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3426 } /* End of class-processing loop */
3428 if (class_range_state == RANGE_STARTED)
3430 parsed_pattern[-1] = CHAR_MINUS;
3431 class_range_state = RANGE_NO;
3434 *parsed_pattern++ = META_CLASS_END;
3435 break; /* End of character class */
3438 /* ---- Opening parenthesis ---- */
3440 case CHAR_LEFT_PARENTHESIS:
3441 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3443 /* If ( is not followed by ? it is either a capture or a special verb. */
3445 if (*ptr != CHAR_QUESTION_MARK)
3449 /* Handle capturing brackets (or non-capturing if auto-capture is turned off). */
3452 if (*ptr != CHAR_ASTERISK)
3455 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3458 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3460 else *parsed_pattern++ = META_NOCAPTURE;
3464 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3466 /* Do nothing for (*) so it gives a "bad quantifier" error rather than
3467 "(*MARK) must have an argument". */
3469 else if (ptrend - ptr > 1 && ptr[1] != CHAR_RIGHT_PARENTHESIS)
3472 if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode,
3474 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3475 *ptr != CHAR_RIGHT_PARENTHESIS))
3477 errorcode = ERR60; /* Malformed */
3481 /* Scan the table of verb names */
3483 for (i = 0; i < verbcount; i++)
3485 if (namelen == verbs[i].len &&
3486 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3488 vn += verbs[i].len + 1;
3493 errorcode = ERR60; /* Verb not recognized */
3497 /* An empty argument is treated as no argument. */
3499 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3500 ptr[1] == CHAR_RIGHT_PARENTHESIS)
3501 ptr++; /* Advance to the closing parens */
3503 /* Check for mandatory non-empty argument; this is (*MARK) */
3505 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3511 /* It appears that Perl allows any characters whatsoever, other than a
3512 closing parenthesis, to appear in arguments ("names"), so we no longer
3513 insist on letters, digits, and underscores. Perl does not, however, do
3514 any interpretation within arguments, and has no means of including a
3515 closing parenthesis. PCRE supports escape processing but only when it
3516 is requested by an option. We set inverbname TRUE here, and let the
3517 main loop take care of this so that escape and \x processing is done by
3518 the main code above. */
3520 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
3522 /* Some optional arguments can be treated as a preceding (*MARK) */
3524 if (verbs[i].has_arg < 0)
3526 add_after_mark = verbs[i].meta;
3527 *parsed_pattern++ = META_MARK;
3530 /* The remaining verbs with arguments (except *MARK) need a different opcode. */
3535 *parsed_pattern++ = verbs[i].meta +
3536 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3539 /* Set up for reading the name in the main loop. */
3541 verblengthptr = parsed_pattern++;
3542 verbnamestart = ptr;
3545 else /* No verb "name" argument */
3547 *parsed_pattern++ = verbs[i].meta;
3549 } /* End of (*VERB) handling */
3550 break; /* Done with this parenthesis */
3551 } /* End of groups that don't start with (? */
3554 /* ---- Items starting (? ---- */
3556 /* The type of item is determined by what follows (?. Handle (?| and option
3557 changes under "default" because both need a new block on the nest stack.
3558 Comments starting with (?# are handled above. Note that there is some
3559 ambiguity about the sequence (?- because if a digit follows it's a relative
3560 recursion or subroutine call whereas otherwise it's an option unsetting. */
3562 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3567 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3568 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
3570 /* We now have either (?| or a (possibly empty) option setting,
3571 optionally followed by a non-capturing group. */
3574 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3575 else if (++top_nest >= end_nests)
3580 top_nest->nest_depth = nest_depth;
3581 top_nest->flags = 0;
3582 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3584 /* Start of non-capturing group that resets the capture count for each branch. */
3587 if (*ptr == CHAR_VERTICAL_LINE)
3589 top_nest->reset_group = (uint16_t)cb->bracount;
3590 top_nest->max_group = (uint16_t)cb->bracount;
3591 top_nest->flags |= NSF_RESET;
3592 cb->external_flags |= PCRE2_DUPCAPUSED;
3593 *parsed_pattern++ = META_NOCAPTURE;
3597 /* Scan for options imnsxJU to be set or unset. */
3601 BOOL hyphenok = TRUE;
3602 uint32_t oldoptions = options;
3604 top_nest->reset_group = 0;
3605 top_nest->max_group = 0;
3609 /* ^ at the start unsets imnsx and disables the subsequent use of - */
3611 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3613 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3614 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3619 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3628 ptr--; /* Correct the offset */
3635 case CHAR_J: /* Record that it changed in the external options */
3636 *optset |= PCRE2_DUPNAMES;
3637 cb->external_flags |= PCRE2_JCHANGED;
3640 case CHAR_i: *optset |= PCRE2_CASELESS; break;
3641 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3642 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3643 case CHAR_s: *optset |= PCRE2_DOTALL; break;
3644 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
3646 /* If x appears twice it sets the extended extended option. */
3649 *optset |= PCRE2_EXTENDED;
3650 if (ptr < ptrend && *ptr == CHAR_x)
3652 *optset |= PCRE2_EXTENDED_MORE;
3659 ptr--; /* Correct the offset */
3664 /* If we are setting extended without extended-more, ensure that any
3665 existing extended-more gets unset. Also, unsetting extended must also
3666 unset extended-more. */
3668 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
3669 (unset & PCRE2_EXTENDED) != 0)
3670 unset |= PCRE2_EXTENDED_MORE;
3672 options = (options | set) & (~unset);
3674 /* If the options ended with ')' this is not the start of a nested
3675 group with option changes, so the options change at this level.
3676 In this case, if the previous level set up a nest block, discard the
3677 one we have just created. Otherwise adjust it for the previous level.
3678 If the options ended with ':' we are starting a non-capturing group,
3679 possibly with an options setting. */
3681 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3682 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
3684 nest_depth--; /* This is not a nested group after all. */
3685 if (top_nest > (nest_save *)(cb->start_workspace) &&
3686 (top_nest-1)->nest_depth == nest_depth) top_nest--;
3687 else top_nest->nest_depth = nest_depth;
3689 else *parsed_pattern++ = META_NOCAPTURE;
3691 /* If nothing changed, no need to record. */
3693 if (options != oldoptions)
3695 *parsed_pattern++ = META_OPTIONS;
3696 *parsed_pattern++ = options;
3698 } /* End options processing */
3699 break; /* End default case after (? */
3702 /* ---- Python syntax support ---- */
3705 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3707 /* (?P<name> is the same as (?<name>, which defines a named group. */
3709 if (*ptr == CHAR_LESS_THAN_SIGN)
3711 terminator = CHAR_GREATER_THAN_SIGN;
3715 /* (?P>name) is the same as (?&name), which is a recursion or subroutine call. */
3718 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
3720 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
3721 else after (?P is an error. */
3723 if (*ptr != CHAR_EQUALS_SIGN)
3728 if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
3729 &namelen, &errorcode, cb)) goto FAILED;
3730 *parsed_pattern++ = META_BACKREF_BYNAME;
3731 *parsed_pattern++ = namelen;
3732 PUTOFFSET(offset, parsed_pattern);
3733 okquantifier = TRUE;
3734 break; /* End of (?P processing */
3737 /* ---- Recursion/subroutine calls by number ---- */
3740 i = 0; /* (?R) == (?R0) */
3742 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
3749 /* An item starting (?- followed by a digit comes here via the "default"
3750 case because (?- followed by a non-digit is an options setting. */
3753 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
3755 errorcode = ERR29; /* Missing number */
3760 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
3761 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
3763 if (!read_number(&ptr, ptrend,
3764 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
3765 MAX_GROUP_NUMBER, ERR61,
3766 &i, &errorcode)) goto FAILED;
3767 if (i < 0) /* NB (?0) is permitted */
3769 errorcode = ERR15; /* Unknown group */
3772 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
3773 goto UNCLOSED_PARENTHESIS;
3776 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
3777 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
3779 PUTOFFSET(offset, parsed_pattern);
3780 okquantifier = TRUE;
3781 break; /* End of recursive call by number handling */
3784 /* ---- Recursion/subroutine calls by name ---- */
3786 case CHAR_AMPERSAND:
3788 if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
3789 &namelen, &errorcode, cb)) goto FAILED;
3790 *parsed_pattern++ = META_RECURSE_BYNAME;
3791 *parsed_pattern++ = namelen;
3792 PUTOFFSET(offset, parsed_pattern);
3793 okquantifier = TRUE;
3796 /* ---- Callout with numerical or string argument ---- */
3799 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3801 /* If the previous item was a condition starting (?(? an assertion,
3802 optionally preceded by a callout, is expected. This is checked later on,
3803 during actual compilation. However we need to identify this kind of
3804 assertion in this pass because it must not be qualified. The value of
3805 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
3806 for a callout - still leaving a positive value that identifies the
3807 assertion. Multiple callouts or any other items will make it zero or
3808 less, which doesn't matter because they will cause an error later. */
3810 expect_cond_assert = prev_expect_cond_assert - 1;
3812 /* If previous_callout is not NULL, it means this follows a previous
3813 callout. If it was a manual callout, do nothing; this means its "length
3814 of next pattern item" field will remain zero. If it was an automatic
3815 callout, abolish it. */
3817 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
3818 previous_callout == parsed_pattern - 4 &&
3819 parsed_pattern[-1] == 255)
3820 parsed_pattern = previous_callout;
3822 /* Save for updating next pattern item length, and skip one item before completing. */
3825 previous_callout = parsed_pattern;
3826 after_manual_callout = 1;
3828 /* Handle a string argument; specific delimiter is required. */
3830 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
3832 PCRE2_SIZE calloutlength;
3833 PCRE2_SPTR startptr = ptr;
3836 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
3838 if (*ptr == PRIV(callout_start_delims)[i])
3840 delimiter = PRIV(callout_end_delims)[i];
3850 *parsed_pattern = META_CALLOUT_STRING;
3851 parsed_pattern += 3; /* Skip pattern info */
3855 if (++ptr >= ptrend)
3858 ptr = startptr; /* To give a more useful message */
3861 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
3865 calloutlength = (PCRE2_SIZE)(ptr - startptr);
3866 if (calloutlength > UINT32_MAX)
3871 *parsed_pattern++ = (uint32_t)calloutlength;
3872 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
3873 PUTOFFSET(offset, parsed_pattern);
3876 /* Handle a callout with an optional numerical argument, which must be
3877 less than or equal to 255. A missing argument gives 0. */
3882 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
3883 parsed_pattern += 3; /* Skip pattern info */
3884 while (ptr < ptrend && IS_DIGIT(*ptr))
3886 n = n * 10 + *ptr++ - CHAR_0;
3893 *parsed_pattern++ = n;
3896 /* Both formats must have a closing parenthesis */
3898 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
3905 /* Remember the offset to the next item in the pattern, and set a default
3906 length. This should get updated after the next item is read. */
3908 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
3909 previous_callout[2] = 0;
3910 break; /* End callout */
3913 /* ---- Conditional group ---- */
3915 /* A condition can be an assertion, a number (referring to a numbered
3916 group's having been set), a name (referring to a named group), or 'R',
3917 referring to overall recursion. R<digits> and R&name are also permitted
3918 for recursion state tests. Numbers may be preceded by + or - to specify a
3919 relative group number.
3921 There are several syntaxes for testing a named group: (?(name)) is used
3922 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3924 There are two unfortunate ambiguities. 'R' can be the recursive thing or
3925 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
3926 the Perl DEFINE feature or the Python named test. We look for a name
3927 first; if not found, we try the other case.
3929 For compatibility with auto-callouts, we allow a callout to be specified
3930 before a condition that is an assertion. */
3932 case CHAR_LEFT_PARENTHESIS:
3933 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3936 /* If the next character is ? there must be an assertion next (optionally
3937 preceded by a callout). We do not check this here, but instead we set
3938 expect_cond_assert to 2. If this is still greater than zero (callouts
3939 decrement it) when the next assertion is read, it will be marked as a
3940 condition that must not be repeated. A value greater than zero also
3941 causes checking that an assertion (possibly with callout) follows. */
3943 if (*ptr == CHAR_QUESTION_MARK)
3945 *parsed_pattern++ = META_COND_ASSERT;
3946 ptr--; /* Pull pointer back to the opening parenthesis. */
3947 expect_cond_assert = 2;
3948 break; /* End of conditional */
3951 /* Handle (?([+-]number)... */
3953 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3961 *parsed_pattern++ = META_COND_NUMBER;
3962 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
3963 PUTOFFSET(offset, parsed_pattern);
3964 *parsed_pattern++ = i;
3966 else if (errorcode != 0) goto FAILED; /* Number too big */
3968 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
3970 else if (ptrend - ptr >= 10 &&
3971 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
3972 ptr[7] != CHAR_RIGHT_PARENTHESIS)
3979 if (*ptr == CHAR_GREATER_THAN_SIGN)
3985 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
3986 references its argument twice. */
3988 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
3989 goto BAD_VERSION_CONDITION;
3991 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
3994 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
3995 if (*ptr == CHAR_DOT)
3997 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
3998 minor = (*ptr++ - CHAR_0) * 10;
3999 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4000 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4001 goto BAD_VERSION_CONDITION;
4004 *parsed_pattern++ = META_COND_VERSION;
4005 *parsed_pattern++ = ge;
4006 *parsed_pattern++ = major;
4007 *parsed_pattern++ = minor;
4010 /* All the remaining cases now require us to read a name. We cannot at
4011 this stage distinguish ambiguous cases such as (?(R12) which might be a
4012 recursion test by number or a name, because the named groups have not yet
4013 all been identified. Those cases are treated as names, but given a
4014 different META code. */
4018 BOOL was_r_ampersand = FALSE;
4020 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4022 terminator = CHAR_RIGHT_PARENTHESIS;
4023 was_r_ampersand = TRUE;
4026 else if (*ptr == CHAR_LESS_THAN_SIGN)
4027 terminator = CHAR_GREATER_THAN_SIGN;
4028 else if (*ptr == CHAR_APOSTROPHE)
4029 terminator = CHAR_APOSTROPHE;
4032 terminator = CHAR_RIGHT_PARENTHESIS;
4033 ptr--; /* Point to char before name */
4035 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
4036 &errorcode, cb)) goto FAILED;
4038 /* Handle (?(R&name) */
4040 if (was_r_ampersand)
4042 *parsed_pattern = META_COND_RNAME;
4043 ptr--; /* Back to closing parens */
4046 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4047 special code. Likewise if the name consists of R followed only by
4048 digits. Otherwise, handle it like a quoted name. */
4050 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4052 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4053 *parsed_pattern = META_COND_DEFINE;
4056 for (i = 1; i < (int)namelen; i++)
4057 if (!IS_DIGIT(name[i])) break;
4058 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4059 META_COND_RNUMBER : META_COND_NAME;
4061 ptr--; /* Back to closing parens */
4064 /* Handle (?('name') or (?(<name>) */
4066 else *parsed_pattern = META_COND_NAME;
4068 /* All these cases except DEFINE end with the name length and offset;
4069 DEFINE just has an offset (for the "too many branches" error). */
4071 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4072 PUTOFFSET(offset, parsed_pattern);
4073 } /* End cases that read a name */
4075 /* Check the closing parenthesis of the condition */
4077 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4083 break; /* End of condition processing */
4086 /* ---- Atomic group ---- */
4088 case CHAR_GREATER_THAN_SIGN:
4089 *parsed_pattern++ = META_ATOMIC;
4095 /* ---- Lookahead assertions ---- */
4097 case CHAR_EQUALS_SIGN:
4098 *parsed_pattern++ = META_LOOKAHEAD;
4100 goto POST_ASSERTION;
4102 case CHAR_EXCLAMATION_MARK:
4103 *parsed_pattern++ = META_LOOKAHEADNOT;
4105 goto POST_ASSERTION;
4108 /* ---- Lookbehind assertions ---- */
4110 /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
4111 start of the name of a capturing group. */
4113 case CHAR_LESS_THAN_SIGN:
4114 if (ptrend - ptr <= 1 ||
4115 (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
4117 terminator = CHAR_GREATER_THAN_SIGN;
4120 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4121 META_LOOKBEHIND : META_LOOKBEHINDNOT;
4122 *has_lookbehind = TRUE;
4123 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4124 PUTOFFSET(offset, parsed_pattern);
4128 /* If the previous item was a condition starting (?(? an assertion,
4129 optionally preceded by a callout, is expected. This is checked later on,
4130 during actual compilation. However we need to identify this kind of
4131 assertion in this pass because it must not be qualified. The value of
4132 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4133 for a callout - still leaving a positive value that identifies the
4134 assertion. Multiple callouts or any other items will make it zero or
4135 less, which doesn't matter because they will cause an error later. */
4139 if (prev_expect_cond_assert > 0)
4141 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4142 else if (++top_nest >= end_nests)
4147 top_nest->nest_depth = nest_depth;
4148 top_nest->flags = NSF_CONDASSERT;
4149 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4154 /* ---- Define a named group ---- */
4156 /* A named group may be defined as (?'name') or (?<name>). In the latter
4157 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4158 terminator set to '>'. */
4160 case CHAR_APOSTROPHE:
4161 terminator = CHAR_APOSTROPHE; /* Terminator */
4164 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
4165 &errorcode, cb)) goto FAILED;
4167 /* We have a name for this capturing group. It is also assigned a number,
4168 which is its primary means of identification. */
4171 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4174 /* Check not too many names */
4176 if (cb->names_found >= MAX_NAME_COUNT)
4182 /* Adjust the entry size to accommodate the longest name found. */
4184 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4185 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4187 /* Scan the list to check for duplicates. For duplicate names, if the
4188 number is the same, break the loop, which causes the name to be
4189 discarded; otherwise, if DUPNAMES is not set, give an error.
4190 If it is set, allow the name with a different number, but continue
4191 scanning in case this is a duplicate with the same number. For
4192 non-duplicate names, give an error if the number is duplicated. */
4195 ng = cb->named_groups;
4196 for (i = 0; i < cb->names_found; i++, ng++)
4198 if (namelen == ng->length &&
4199 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4201 if (ng->number == cb->bracount) break;
4202 if ((options & PCRE2_DUPNAMES) == 0)
4207 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4208 cb->dupnames = TRUE; /* Duplicate names exist */
4210 else if (ng->number == cb->bracount)
4217 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4219 /* Increase the list size if necessary */
4221 if (cb->names_found >= cb->named_group_list_size)
4223 uint32_t newsize = cb->named_group_list_size * 2;
4224 named_group *newspace =
4225 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4226 cb->cx->memctl.memory_data);
4227 if (newspace == NULL)
4233 memcpy(newspace, cb->named_groups,
4234 cb->named_group_list_size * sizeof(named_group));
4235 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4236 cb->cx->memctl.free((void *)cb->named_groups,
4237 cb->cx->memctl.memory_data);
4238 cb->named_groups = newspace;
4239 cb->named_group_list_size = newsize;
4242 /* Add this name to the list */
4244 cb->named_groups[cb->names_found].name = name;
4245 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4246 cb->named_groups[cb->names_found].number = cb->bracount;
4247 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4250 } /* End of (? switch */
4251 break; /* End of ( handling */
4254 /* ---- Branch terminators ---- */
4256 /* Alternation: reset the capture count if we are in a (?| group. */
4258 case CHAR_VERTICAL_LINE:
4259 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4260 (top_nest->flags & NSF_RESET) != 0)
4262 if (cb->bracount > top_nest->max_group)
4263 top_nest->max_group = (uint16_t)cb->bracount;
4264 cb->bracount = top_nest->reset_group;
4266 *parsed_pattern++ = META_ALT;
4269 /* End of group; reset the capture count to the maximum if we are in a (?|
4270 group and/or reset the options that are tracked during parsing. Disallow
4271 quantifier for a condition that is an assertion. */
4273 case CHAR_RIGHT_PARENTHESIS:
4274 okquantifier = TRUE;
4275 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4277 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4278 if ((top_nest->flags & NSF_RESET) != 0 &&
4279 top_nest->max_group > cb->bracount)
4280 cb->bracount = top_nest->max_group;
4281 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4282 okquantifier = FALSE;
4283 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4286 if (nest_depth == 0) /* Unmatched closing parenthesis */
4292 *parsed_pattern++ = META_KET;
4294 } /* End of switch on pattern character */
4295 } /* End of main character scan loop */
4297 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4299 if (inverbname && ptr >= ptrend)
4305 /* Manage callout for the final item */
4308 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4309 parsed_pattern, cb);
4311 /* Insert trailing items for word and line matching (features provided for the
4312 benefit of pcre2grep). */
4314 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4316 *parsed_pattern++ = META_KET;
4317 *parsed_pattern++ = META_DOLLAR;
4319 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4321 *parsed_pattern++ = META_KET;
4322 *parsed_pattern++ = META_ESCAPE + ESC_b;
4325 /* Terminate the parsed pattern, then return success if all groups are closed.
4326 Otherwise we have unclosed parentheses. */
4328 if (parsed_pattern >= parsed_pattern_end)
4330 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
4334 *parsed_pattern = META_END;
4335 if (nest_depth == 0) return 0;
4337 UNCLOSED_PARENTHESIS:
4340 /* Come here for all failures. */
4343 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4346 /* Some errors need to indicate the previous character. */
4352 /* This failure happens several times. */
4354 BAD_VERSION_CONDITION:
4361 /*************************************************
4362 * Find first significant opcode *
4363 *************************************************/
4365 /* This is called by several functions that scan a compiled expression looking
4366 for a fixed first character, or an anchoring opcode etc. It skips over things
4367 that do not influence this. For some calls, it makes sense to skip negative
4368 forward and all backward assertions, and also the \b assertion; for others it does not.
4372 code pointer to the start of the group
4373 skipassert TRUE if certain assertions are to be skipped
4375 Returns: pointer to the first significant opcode */
/* Walks forward from `code` and returns a pointer into the compiled expression;
when `skipassert` is FALSE the assertion-type opcodes shown below are returned
as-is instead of being stepped over.
NOTE(review): the embedded line numbers in this listing skip values
(4379 to 4387, 4395 to 4405, 4409 to 4413, ...), so the enclosing loop/switch,
several case labels and the break/return lines are not visible here. The
comments below describe only the statements that are shown. */
4378 static const PCRE2_UCHAR*
4379 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4387 case OP_ASSERTBACK_NOT:
/* If the caller did not ask for assertions to be skipped, this opcode is the
answer. Otherwise add the value read by GET(code, 1) to `code`, repeating while
the opcode landed on is OP_ALT, and finally step over the opcode reached by its
entry in the OP_lengths table. */
4388 if (!skipassert) return code;
4389 do code += GET(code, 1); while (*code == OP_ALT);
4390 code += PRIV(OP_lengths)[*code];
4393 case OP_WORD_BOUNDARY:
4394 case OP_NOT_WORD_BOUNDARY:
/* Word-boundary opcodes are likewise returned unless skipassert is set. */
4395 if (!skipassert) return code;
/* NOTE(review): lines are missing between the test above and the statement
below, so the case label(s) it belongs to cannot be determined from this
listing. It advances by the table length of the current opcode. */
4405 code += PRIV(OP_lengths)[*code];
4408 case OP_CALLOUT_STR:
/* Advance by the value stored at offset 1 + 2*LINK_SIZE within the item
(presumably the item's total length; confirm against the opcode layout). */
4409 code += GET(code, 1 + 2*LINK_SIZE);
/* NOTE(review): the case label for the next statement is not visible. It
advances by 2 + LINK_SIZE plus the value read at offset 2. */
4413 code += 2 + GET(code, 2) + LINK_SIZE;
/* NOTE(review): case label not visible. Per the inline comments, the test is
true when the item after the link is not OP_FALSE ("Not DEFINE") or when the
opcode found at offset GET(code, 1) is not OP_KET ("More than one branch").
The action taken when it is true is on a line not shown; the statement after
the test advances past the link value plus 1 + LINK_SIZE. */
4418 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
4419 code[GET(code, 1)] != OP_KET) /* More than one branch */
4421 code += GET(code, 1) + 1 + LINK_SIZE;
4428 /* Control never reaches here */
4433 #ifdef SUPPORT_UNICODE
4434 /*************************************************
4435 * Get othercase range *
4436 *************************************************/
4438 /* This function is passed the start and end of a class range in UCP mode. It
4439 searches up the characters, looking for ranges of characters in the "other"
4440 case. Each call returns the next one, updating the start address. A character
4441 with multiple other cases is returned on its own with a special return value.
4444 cptr points to starting character value; updated
4446 ocptr where to put start of othercase range
4447 odptr where to put end of othercase range
4449 Yield: -1 when no more
4450 0 when a range is returned
4451 >0 the CASESET offset for char with multiple other cases
4452 in this case, ocptr contains the original */
/* Scans the characters from *cptr up to and including d and reports what it
finds through *ocptr / *odptr, moving *cptr on to the unprocessed remainder.
NOTE(review): the embedded line numbers skip values (4456 to 4459,
4470 to 4473, 4487 to 4491, ...), so the rest of the signature, the declaration
of `co`, and some statements (including returns) are not visible in this
listing. The comments below cover only what is shown. */
4456 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
4459 uint32_t c, othercase, next;
4462 /* Find the first character that has an other case. If it has multiple other
4463 cases, return its case offset value. */
4465 for (c = *cptr; c <= d; c++)
/* `co` receives UCD_CASESET(c). When it is non-zero, c itself is handed back
through *ocptr and *cptr is set to the character after it. */
4467 if ((co = UCD_CASESET(c)) != 0)
4469 *ocptr = c++; /* Character that has the set */
4470 *cptr = c; /* Rest of input range */
/* Leave the scan at the first c whose UCD_OTHERCASE value differs from c. */
4473 if ((othercase = UCD_OTHERCASE(c)) != c) break;
4476 if (c > d) return -1; /* Reached end of range */
4478 /* Found a character that has a single other case. Search for the end of the
4479 range, which is either the end of the input range, or a character that has zero
4480 or more than one other cases. */
/* `next` is the other-case value that the following input character is
compared against in the loop below. */
4483 next = othercase + 1;
4485 for (++c; c <= d; c++)
/* Stop extending at a character with a non-zero case-set value, or whose
other case is not the expected `next` value. NOTE(review): no update of `next`
between iterations is visible in this listing; confirm against the full
source. */
4487 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4491 *odptr = next - 1; /* End of othercase range */
4492 *cptr = c; /* Rest of input range */
4495 #endif /* SUPPORT_UNICODE */
4499 /*************************************************
4500 * Add a character or range to a class (internal) *
4501 *************************************************/
4503 /* This function packages up the logic of adding a character or range of
4504 characters to a class. The character values in the arguments will be within the
4505 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4506 called only from within the "add to class" group of functions, some of which
4507 are recursive and mutually recursive. The external entry point is add_to_class().
4511 classbits the bit map for characters < 256
4512 uchardptr points to the pointer for extra data
4513 options the options word
4515 start start of range character
4516 end end of range character
4518 Returns: the number of < 256 characters added
4519 the pointer to extra data is updated
/* Add the range start..end to a class. Code points up to 0xff are recorded in
the classbits bitmap; when SUPPORT_WIDE_CHARS is defined, larger values are
written as XCL_RANGE or XCL_SINGLE items through *uchardptr, which is updated.
The value returned is n8.
NOTE(review): the embedded line numbers in this listing have gaps (braces,
short declarations and some short statements are not shown) - confirm details
against the complete file. */
4523 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4524 uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
/* classbits_end is the last value handled through the bitmap: end, capped at
0xff. */
4527 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
4528 unsigned int n8 = 0;
4530 /* If caseless matching is required, scan the range and process alternate
4531 cases. In Unicode, there are 8-bit characters that have alternate cases that
4532 are greater than 255 and vice-versa. Sometimes we can just extend the original
4535 if ((options & PCRE2_CASELESS) != 0)
4537 #ifdef SUPPORT_UNICODE
4538 if ((options & PCRE2_UTF) != 0)
4543 options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
/* Each other-case range reported by get_othercase_range() is either skipped,
merged into start..end, or added by a separate call. */
4546 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4548 /* Handle a single character that has more than one other case. */
4550 if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
4551 PRIV(ucd_caseless_sets) + rc, oc);
4553 /* Do nothing if the other case range is within the original range. */
4555 else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
4557 /* Extend the original range if there is overlap, noting that if oc < c, we
4558 can't have od > end because a subrange is always shorter than the basic
4559 range. Otherwise, use a recursive call to add the additional range. */
4561 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4562 else if (od > end && oc <= end + 1)
4564 end = od; /* Extend upwards */
4565 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4567 else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
4571 #endif /* SUPPORT_UNICODE */
/* For each value in the bitmap part of the range, set the bit for cb->fcc[c]
(presumably the case-flipped value, used on the non-UTF caseless path - confirm
against the complete file). */
4575 for (c = start; c <= classbits_end; c++)
4577 SETBIT(classbits, cb->fcc[c]);
4582 /* Now handle the originally supplied range. Adjust the final value according
4583 to the bit length - this means that the same lists of (e.g.) horizontal spaces
4584 can be used in all cases. */
4586 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
4587 end = MAX_NON_UTF_CHAR;
/* Nothing more to add when this range lies strictly inside the overall class
range recorded in cb. */
4589 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
4591 /* Use the bitmap for characters < 256. Otherwise use extra data. */
4593 for (c = start; c <= classbits_end; c++)
4595 /* Regardless of start, c will always be <= 255. */
4596 SETBIT(classbits, c);
4600 #ifdef SUPPORT_WIDE_CHARS
/* Values up to 0xff were dealt with through the bitmap above, so the extra
data only needs to cover 0x100 upwards. */
4601 if (start <= 0xff) start = 0xff + 1;
4605 PCRE2_UCHAR *uchardata = *uchardptr;
4607 #ifdef SUPPORT_UNICODE
/* In UTF mode the code points are written with PRIV(ord2utf), whose return
value is used to advance the output pointer. */
4608 if ((options & PCRE2_UTF) != 0)
4612 *uchardata++ = XCL_RANGE;
4613 uchardata += PRIV(ord2utf)(start, uchardata);
4614 uchardata += PRIV(ord2utf)(end, uchardata);
4616 else if (start == end)
4618 *uchardata++ = XCL_SINGLE;
4619 uchardata += PRIV(ord2utf)(start, uchardata);
4623 #endif /* SUPPORT_UNICODE */
4625 /* Without UTF support, character values are constrained by the bit length,
4626 and can only be > 256 for 16-bit and 32-bit libraries. */
4628 #if PCRE2_CODE_UNIT_WIDTH == 8
4633 *uchardata++ = XCL_RANGE;
4634 *uchardata++ = start;
4637 else if (start == end)
4639 *uchardata++ = XCL_SINGLE;
4640 *uchardata++ = start;
4642 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
4643 *uchardptr = uchardata; /* Update extra data pointer */
4645 #else /* SUPPORT_WIDE_CHARS */
4646 (void)uchardptr; /* Avoid compiler warning */
4647 #endif /* SUPPORT_WIDE_CHARS */
4649 return n8; /* Number of 8-bit characters */
4654 #ifdef SUPPORT_UNICODE
4655 /*************************************************
4656 * Add a list of characters to a class (internal) *
4657 *************************************************/
4659 /* This function is used for adding a list of case-equivalent characters to a
4660 class when in UTF mode. This function is called only from within
4661 add_to_class_internal(), with which it is mutually recursive.
4664 classbits the bit map for characters < 256
4665 uchardptr points to the pointer for extra data
4666 options the options word
4667 cb contains pointers to tables etc.
4668 p points to row of 32-bit values, terminated by NOTACHAR
4669 except character to omit; this is used when adding lists of
4670 case-equivalent characters to avoid including the one we
4673 Returns: the number of < 256 characters added
4674 the pointer to extra data is updated
/* Walk the list p while p[0] is below NOTACHAR, grouping consecutive values
into runs and adding each run p[0]..p[n] with add_to_class_internal(); the
counts returned by those calls are accumulated in n8.
NOTE(review): the declaration of n, any use of except, the step that advances
p and the return statement are not visible in this listing (its embedded line
numbers have gaps) - confirm against the complete file. */
4678 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4679 uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
4681 unsigned int n8 = 0;
4682 while (p[0] < NOTACHAR)
/* Extend n for as long as the next list entry continues the consecutive run
that starts at p[0]. */
4687 while(p[n+1] == p[0] + n + 1) n++;
4688 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
4698 /*************************************************
4699 * External entry point for add range to class *
4700 *************************************************/
4702 /* This function sets the overall range so that the internal functions can try
4703 to avoid duplication when handling case-independence.
4706 classbits the bit map for characters < 256
4707 uchardptr points to the pointer for extra data
4708 options the options word
4710 start start of range character
4711 end end of range character
4713 Returns: the number of < 256 characters added
4714 the pointer to extra data is updated
/* Record start..end as the overall class range in cb, then pass the same range
to add_to_class_internal() and return its result. */
4718 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
4719 compile_block *cb, uint32_t start, uint32_t end)
/* add_to_class_internal() compares against these two fields when deciding
whether an other-case range is already covered. */
4721 cb->class_range_start = start;
4722 cb->class_range_end = end;
4723 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
4727 /*************************************************
4728 * External entry point for add list to class *
4729 *************************************************/
4731 /* This function is used for adding a list of horizontal or vertical whitespace
4732 characters to a class. The list must be in order so that ranges of characters
4733 can be detected and handled appropriately. This function sets the overall range
4734 so that the internal functions can try to avoid duplication when handling
4738 classbits the bit map for characters < 256
4739 uchardptr points to the pointer for extra data
4740 options the options word
4741 cb contains pointers to tables etc.
4742 p points to row of 32-bit values, terminated by NOTACHAR
4743 except character to omit; this is used when adding lists of
4744 case-equivalent characters to avoid including the one we
4747 Returns: the number of < 256 characters added
4748 the pointer to extra data is updated
/* Walk the list p while p[0] is below NOTACHAR, grouping consecutive values
into runs. Each run p[0]..p[n] is recorded in cb as the overall class range and
then added with add_to_class_internal(); the counts returned are accumulated in
n8.
NOTE(review): the declaration of n, any use of except, the step that advances
p and the return statement are not visible in this listing (its embedded line
numbers have gaps) - confirm against the complete file. */
4752 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
4753 compile_block *cb, const uint32_t *p, unsigned int except)
4755 unsigned int n8 = 0;
4756 while (p[0] < NOTACHAR)
/* Extend n for as long as the next list entry continues the consecutive run
that starts at p[0]. */
4761 while(p[n+1] == p[0] + n + 1) n++;
/* The overall class range is reset for every run, not once for the list. */
4762 cb->class_range_start = p[0];
4763 cb->class_range_end = p[n];
4764 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
4773 /*************************************************
4774 * Add characters not in a list to a class *
4775 *************************************************/
4777 /* This function is used for adding the complement of a list of horizontal or
4778 vertical whitespace to a class. The list must be in order.
4781 classbits the bit map for characters < 256
4782 uchardptr points to the pointer for extra data
4783 options the options word
4784 cb contains pointers to tables etc.
4785 p points to row of 32-bit values, terminated by NOTACHAR
4787 Returns: the number of < 256 characters added
4788 the pointer to extra data is updated
/* Add every value that is NOT in the list p: first the range below the first
entry, then each gap between runs of consecutive entries, with the final gap
running up to 0x10ffff in UTF mode or 0xffffffff otherwise. Each range is added
with add_to_class() and the counts returned are accumulated in n8.
NOTE(review): as shown here the first call passes p[0] - 1 with no visible
guard, which would wrap as unsigned arithmetic if p[0] were 0. The embedded
line numbers skip 4797, so a guarding line may simply be missing from this
listing, as are the step that advances p and the return statement - confirm
against the complete file. */
4792 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4793 uint32_t options, compile_block *cb, const uint32_t *p)
4795 BOOL utf = (options & PCRE2_UTF) != 0;
4796 unsigned int n8 = 0;
4798 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
4799 while (p[0] < NOTACHAR)
/* Move p to the last entry of the current run of consecutive values. */
4801 while (p[1] == p[0] + 1) p++;
/* The gap starts just after the run and ends just before the next entry, or
at the mode's upper limit when the next entry is the NOTACHAR terminator. */
4802 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
4803 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4811 /*************************************************
4812 * Find details of duplicate group names *
4813 *************************************************/
4815 /* This is called from compile_branch() when it needs to know the index and
4816 count of duplicates in the names table when processing named backreferences,
4817 either directly, or as conditions.
4820 name points to the name
4821 length the length of the name
4822 indexptr where to put the index
4823 countptr where to put the count of duplicates
4824 errorcodeptr where to put an error code
4825 cb the compile block
4827 Returns: TRUE if OK, FALSE if not, error code set
/* Locate the first entry in cb->name_table whose name matches name (of the
given length), then step through the entries that follow it for as long as they
carry the same name, folding each entry's group number into cb->backref_map and
cb->top_backref.
NOTE(review): the lines that store through indexptr and countptr, the loop
header for the duplicate scan and the return statements are not visible in this
listing (its embedded line numbers have gaps) - confirm against the complete
file. */
4831 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
4832 int *countptr, int *errorcodeptr, compile_block *cb)
4834 uint32_t i, groupnumber;
4836 PCRE2_UCHAR *slot = cb->name_table;
4838 /* Find the first entry in the table */
4840 for (i = 0; i < cb->names_found; i++)
/* The name text is compared starting IMM2_SIZE code units into the entry; a
match also requires a zero code unit directly after the first length units, so
that a longer name sharing this prefix is not accepted. */
4842 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
4843 slot[IMM2_SIZE+length] == 0) break;
4844 slot += cb->name_entry_size;
4847 /* This should not occur, because this function is called only when we know we
4848 have duplicate names. Give an internal error. */
4850 if (i >= cb->names_found)
4852 *errorcodeptr = ERR53;
4853 cb->erroroffset = name - cb->start_pattern;
4857 /* Record the index and then see how many duplicates there are, updating the
4858 backref map and maximum back reference as we do. */
/* The group number is read from the start of the entry with GET2. */
4866 groupnumber = GET2(slot,0);
/* Group numbers of 32 or more have no bit of their own; bit 0 is set for them
instead. */
4867 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
4868 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
4869 if (++i >= cb->names_found) break;
4870 slot += cb->name_entry_size;
/* Stop at the first following entry whose name differs. */
4871 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
4872 (slot+IMM2_SIZE)[length] != 0) break;
4881 /*************************************************
4882 * Compile one branch *
4883 *************************************************/
4885 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
4886 the options are changed during the branch, the pointer is used to change the
4887 external options bits. This function is used during the pre-compile phase when
4888 we are trying to find out the amount of memory needed, as well as during the
4889 real compile phase. The value of lengthptr distinguishes the two phases.
4892 optionsptr pointer to the option bits
4893 codeptr points to the pointer to the current code point
4894 pptrptr points to the current parsed pattern pointer
4895 errorcodeptr points to error code variable
4896 firstcuptr place to put the first required code unit
4897 firstcuflagsptr place to put the first code unit flags, or a negative number
4898 reqcuptr place to put the last required code unit
4899 reqcuflagsptr place to put the last required code unit flags, or a negative number
4900 bcptr points to current branch chain
4901 cb contains pointers to tables etc.
4902 lengthptr NULL during the real compile phase
4903 points to length accumulator during pre-compile phase
4905 Returns: 0 There's been an error, *errorcodeptr is non-zero
4906 +1 Success, this branch must match at least one character
4907 -1 Success, this branch may match an empty string
4911 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
4912 int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
4913 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
4914 compile_block *cb, PCRE2_SIZE *lengthptr)
4918 int group_return = 0;
4919 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4920 uint32_t greedy_default, greedy_non_default;
4921 uint32_t repeat_type, op_type;
4922 uint32_t options = *optionsptr; /* May change dynamically */
4923 uint32_t firstcu, reqcu;
4924 uint32_t zeroreqcu, zerofirstcu;
4926 uint32_t *pptr = *pptrptr;
4927 uint32_t meta, meta_arg;
4928 int32_t firstcuflags, reqcuflags;
4929 int32_t zeroreqcuflags, zerofirstcuflags;
4930 int32_t req_caseopt, reqvary, tempreqvary;
4931 PCRE2_SIZE offset = 0;
4932 PCRE2_SIZE length_prevgroup = 0;
4933 PCRE2_UCHAR *code = *codeptr;
4934 PCRE2_UCHAR *last_code = code;
4935 PCRE2_UCHAR *orig_code = code;
4936 PCRE2_UCHAR *tempcode;
4937 PCRE2_UCHAR *previous = NULL;
4938 PCRE2_UCHAR op_previous;
4939 BOOL groupsetfirstcu = FALSE;
4940 BOOL matched_char = FALSE;
4941 BOOL previous_matched_char = FALSE;
4942 const uint8_t *cbits = cb->cbits;
4943 uint8_t classbits[32];
4945 /* We can fish out the UTF setting once and for all into a BOOL, but we must
4946 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
4947 dynamically as we process the pattern. */
4949 #ifdef SUPPORT_UNICODE
4950 BOOL utf = (options & PCRE2_UTF) != 0;
4951 #else /* No UTF support */
4955 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4956 class_uchardata always so that it can be passed to add_to_class() always,
4957 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4958 alternative calls for the different cases. */
4960 PCRE2_UCHAR *class_uchardata;
4961 #ifdef SUPPORT_WIDE_CHARS
4963 PCRE2_UCHAR *class_uchardata_base;
4966 /* Set up the default and non-default settings for greediness */
4968 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
4969 greedy_non_default = greedy_default ^ 1;
4971 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
4972 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4973 matches a non-fixed first unit; reqcu just remains unset if we never find one.
4975 When we hit a repeat whose minimum is zero, we may have to adjust these values
4976 to take the zero repeat into account. This is implemented by setting them to
4977 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
4978 item types that can be repeated set these backoff variables appropriately. */
4980 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
4981 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
4983 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
4984 according to the current setting of the caseless flag. The REQ_CASELESS value
4985 leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
4986 to record the case status of the value. This is used only for ASCII characters.
4989 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
4991 /* Switch on next META item until the end of the branch */
4995 #ifdef SUPPORT_WIDE_CHARS
4996 BOOL xclass_has_prop;
4999 BOOL should_flip_negation;
5000 BOOL match_all_or_no_wide_chars;
5001 BOOL possessive_quantifier;
5002 BOOL note_group_empty;
5003 int class_has_8bitchar;
5007 uint32_t subreqcu, subfirstcu;
5008 uint32_t groupnumber;
5009 uint32_t verbarglen, verbculen;
5010 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
5012 PCRE2_UCHAR mcbuffer[8];
5014 /* Get next META item in the pattern and its potential argument. */
5016 meta = META_CODE(*pptr);
5017 meta_arg = META_DATA(*pptr);
5019 /* If we are in the pre-compile phase, accumulate the length used for the
5020 previous cycle of this loop, unless the next item is a quantifier. */
5022 if (lengthptr != NULL)
5024 if (code > cb->start_workspace + cb->workspace_size -
5025 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5027 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5032 /* There is at least one situation where code goes backwards: this is the
5033 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5034 is processed, the whole class is eliminated. However, it is created first,
5035 so we have to allow memory for it. Therefore, don't ever reduce the length
5038 if (code < last_code) code = last_code;
5040 /* If the next thing is not a quantifier, we add the length of the previous
5041 item into the total, and reset the code pointer to the start of the
5042 workspace. Otherwise leave the previous item available to be quantified. */
5044 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5046 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5048 *errorcodeptr = ERR20; /* Integer overflow */
5051 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5052 if (*lengthptr > MAX_PATTERN_SIZE)
5054 *errorcodeptr = ERR20; /* Pattern is too large */
5060 /* Remember where this code item starts so we can catch the "backwards"
5061 case above next time round. */
5066 /* Process the next parsed pattern item. If it is not a quantifier, remember
5067 where it starts so that it can be quantified when a quantifier follows.
5068 Checking for the legality of quantifiers happens in parse_regex(), except for
5069 a quantifier after an assertion that is a condition. */
5071 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5074 if (matched_char) okreturn = 1;
5077 previous_matched_char = matched_char;
5078 matched_char = FALSE;
5079 note_group_empty = FALSE;
5080 skipunits = 0; /* Default value for most subgroups */
5084 /* ===================================================================*/
5085 /* The branch terminates at pattern end or | or ) */
5090 *firstcuptr = firstcu;
5091 *firstcuflagsptr = firstcuflags;
5093 *reqcuflagsptr = reqcuflags;
5099 /* ===================================================================*/
5100 /* Handle single-character metacharacters. In multiline mode, ^ disables
5101 the setting of any following char as a first character. */
5103 case META_CIRCUMFLEX:
5104 if ((options & PCRE2_MULTILINE) != 0)
5106 if (firstcuflags == REQ_UNSET)
5107 zerofirstcuflags = firstcuflags = REQ_NONE;
5110 else *code++ = OP_CIRC;
5114 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5117 /* There can never be a first char if '.' is first, whatever happens about
5118 repeats. The value of reqcu doesn't change either. */
5121 matched_char = TRUE;
5122 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5123 zerofirstcu = firstcu;
5124 zerofirstcuflags = firstcuflags;
5126 zeroreqcuflags = reqcuflags;
5127 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5131 /* ===================================================================*/
5132 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5133 Otherwise, an initial ']' is taken as a data character. When empty classes
5134 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5135 match any character, so generate OP_ALLANY. */
5137 case META_CLASS_EMPTY:
5138 case META_CLASS_EMPTY_NOT:
5139 matched_char = TRUE;
5140 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5141 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5142 zerofirstcu = firstcu;
5143 zerofirstcuflags = firstcuflags;
5147 /* ===================================================================*/
5148 /* Non-empty character class. If the included characters are all < 256, we
5149 build a 32-byte bitmap of the permitted characters, except in the special
5150 case where there is only one such character. For negated classes, we build
5151 the map as usual, then invert it at the end. However, we use a different
5152 opcode so that data characters > 255 can be handled correctly.
5154 If the class contains characters outside the 0-255 range, a different
5155 opcode is compiled. It may optionally have a bit map for characters < 256,
5156 but those above are explicitly listed afterwards. A flag code unit
5157 tells whether the bitmap is present, and whether this is a negated class or
5160 case META_CLASS_NOT:
5162 matched_char = TRUE;
5163 negate_class = meta == META_CLASS_NOT;
5165 /* We can optimize the case of a single character in a class by generating
5166 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5167 negative. In the negative case there can be no first char if this item is
5168 first, whatever repeat count may follow. In the case of reqcu, save the
5169 previous value for reinstating. */
5171 /* NOTE: at present this optimization is not effective if the only
5172 character in a class in 32-bit, non-UCP mode has its top bit set. */
5174 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5176 #ifdef SUPPORT_UNICODE
5179 uint32_t c = pptr[1];
5181 pptr += 2; /* Move on to class end */
5182 if (meta == META_CLASS) /* A positive one-char class can be */
5183 { /* handled as a normal literal character. */
5184 meta = c; /* Set up the character */
5185 goto NORMAL_CHAR_SET;
5188 /* Handle a negative one-character class */
5191 zeroreqcuflags = reqcuflags;
5192 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5193 zerofirstcu = firstcu;
5194 zerofirstcuflags = firstcuflags;
5196 /* For caseless UTF mode, check whether this character has more than
5197 one other case. If so, generate a special OP_NOTPROP item instead of
5200 #ifdef SUPPORT_UNICODE
5201 if (utf && (options & PCRE2_CASELESS) != 0 &&
5202 (d = UCD_CASESET(c)) != 0)
5204 *code++ = OP_NOTPROP;
5207 break; /* We are finished with this class */
5210 /* Char has only one other case, or UCP not available */
5212 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5213 code += PUTCHAR(c, code);
5214 break; /* We are finished with this class */
5215 } /* End of 1-char optimization */
5217 /* Handle character classes that contain more than just one literal
5220 /* If a non-extended class contains a negative special such as \S, we need
5221 to flip the negation flag at the end, so that support for characters > 255
5222 works correctly (they are all included in the class). An extended class may
5223 need to insert specific matching or non-matching code for wide characters.
5226 should_flip_negation = match_all_or_no_wide_chars = FALSE;
5228 /* Extended class (xclass) will be used when characters > 255
5231 #ifdef SUPPORT_WIDE_CHARS
5233 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
5234 class_uchardata_base = class_uchardata; /* Save the start */
5237 /* For optimization purposes, we track some properties of the class:
5238 class_has_8bitchar will be non-zero if the class contains at least one
5239 character with a code point less than 256; xclass_has_prop will be TRUE if
5240 Unicode property checks are present in the class. */
5242 class_has_8bitchar = 0;
5243 #ifdef SUPPORT_WIDE_CHARS
5244 xclass_has_prop = FALSE;
5247 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5248 in a temporary bit of memory, in case the class contains fewer than two
5249 8-bit characters because in that case the compiled code doesn't use the bit
5252 memset(classbits, 0, 32 * sizeof(uint8_t));
5254 /* Process items until META_CLASS_END is reached. */
5256 while ((meta = *(++pptr)) != META_CLASS_END)
5258 /* Handle POSIX classes such as [:alpha:] etc. */
5260 if (meta == META_POSIX || meta == META_POSIX_NEG)
5262 BOOL local_negate = (meta == META_POSIX_NEG);
5263 int posix_class = *(++pptr);
5264 int taboffset, tabopt;
5267 should_flip_negation = local_negate; /* Note negative special */
5269 /* If matching is caseless, upper and lower are converted to alpha.
5270 This relies on the fact that the class table starts with alpha,
5271 lower, upper as the first 3 entries. */
5273 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5276 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5277 different escape sequences that use Unicode properties \p or \P.
5278 Others that are not available via \p or \P have to generate
5279 XCL_PROP/XCL_NOTPROP directly, which is done here. */
5281 #ifdef SUPPORT_UNICODE
5282 if ((options & PCRE2_UCP) != 0) switch(posix_class)
5287 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5288 *class_uchardata++ = (PCRE2_UCHAR)
5289 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5290 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5291 *class_uchardata++ = 0;
5292 xclass_has_prop = TRUE;
5293 goto CONTINUE_CLASS;
5295 /* For the other POSIX classes (ascii, xdigit) we are going to
5296 fall through to the non-UCP case and build a bit map for
5297 characters with code points less than 256. However, if we are in
5298 a negated POSIX class, characters with code points greater than
5299 255 must either all match or all not match, depending on whether
5300 the whole class is not or is negated. For example, for
5301 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5304 In the special case where there are no xclass items, this is
5305 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5306 explicit range is needed for OP_XCLASS. Setting a flag here
5307 causes the range to be generated later when it is known that
5308 OP_XCLASS is required. In the 8-bit library this is relevant only in
5309 utf mode, since no wide characters can exist otherwise. */
5312 #if PCRE2_CODE_UNIT_WIDTH == 8
5315 match_all_or_no_wide_chars |= local_negate;
5318 #endif /* SUPPORT_UNICODE */
5320 /* In the non-UCP case, or when UCP makes no difference, we build the
5321 bit map for the POSIX class in a chunk of local store because we may
5322 be adding and subtracting from it, and we don't want to subtract bits
5323 that may be in the main map already. At the end we or the result into
5324 the bit map that is being built. */
5328 /* Copy in the first table (always present) */
5330 memcpy(pbits, cbits + posix_class_maps[posix_class],
5331 32 * sizeof(uint8_t));
5333 /* If there is a second table, add or remove it as required. */
5335 taboffset = posix_class_maps[posix_class + 1];
5336 tabopt = posix_class_maps[posix_class + 2];
5341 for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5343 for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5346 /* Now see if we need to remove any special characters. An option
5347 value of 1 removes vertical space and 2 removes underscore. */
5349 if (tabopt < 0) tabopt = -tabopt;
5350 if (tabopt == 1) pbits[1] &= ~0x3c;
5351 else if (tabopt == 2) pbits[11] &= 0x7f;
5353 /* Add the POSIX table or its complement into the main table that is
5354 being built and we are done. */
5357 for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5359 for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5361 /* Every class contains at least one < 256 character. */
5363 class_has_8bitchar = 1;
5364 goto CONTINUE_CLASS; /* End of POSIX handling */
5367 /* Other than POSIX classes, the only items we should encounter are
5368 \d-type escapes and literal characters (possibly as ranges). */
5370 if (meta == META_BIGVALUE)
5376 /* Any other non-literal must be an escape */
5378 if (meta >= META_END)
5380 if (META_CODE(meta) != META_ESCAPE)
5382 #ifdef DEBUG_SHOW_PARSED
5383 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5384 "in character class\n", meta);
5386 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
5389 escape = META_DATA(meta);
5391 /* Every class contains at least one < 256 character. */
5393 class_has_8bitchar++;
5398 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5402 should_flip_negation = TRUE;
5403 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5407 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5411 should_flip_negation = TRUE;
5412 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5415 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5416 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5417 previously set by something earlier in the character class.
5418 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5419 we could just adjust the appropriate bit. From PCRE 8.34 we no
5420 longer treat \s and \S specially. */
5423 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5427 should_flip_negation = TRUE;
5428 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5431 /* When adding the horizontal or vertical space lists to a class, or
5432 their complements, disable PCRE2_CASELESS, because it just wastes
5433 time, and in the "not-x" UTF cases can create unwanted duplicates in
5434 the XCLASS list (provoked by characters that have more than one other
5435 case and by both cases being in the same "not-x" sublist). */
5438 (void)add_list_to_class(classbits, &class_uchardata,
5439 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5443 (void)add_not_list_to_class(classbits, &class_uchardata,
5444 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5448 (void)add_list_to_class(classbits, &class_uchardata,
5449 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5453 (void)add_not_list_to_class(classbits, &class_uchardata,
5454 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5457 /* If Unicode is not supported, \P and \p are not allowed and are
5458 faulted at parse time, so will never appear here. */
5460 #ifdef SUPPORT_UNICODE
5464 uint32_t ptype = *(++pptr) >> 16;
5465 uint32_t pdata = *pptr & 0xffff;
5466 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5467 *class_uchardata++ = ptype;
5468 *class_uchardata++ = pdata;
5469 xclass_has_prop = TRUE;
5470 class_has_8bitchar--; /* Undo! */
5476 goto CONTINUE_CLASS;
5477 } /* End handling \d-type escapes */
5479 /* A literal character may be followed by a range meta. At parse time
5480 there are checks for out-of-order characters, for ranges where the two
5481 characters are equal, and for hyphens that cannot indicate a range. At
5482 this point, therefore, no checking is needed. */
5491 /* Remember if \r or \n were explicitly used */
5493 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5495 /* Process a character range */
5497 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5500 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5504 if (d == META_BIGVALUE) d = *(++pptr);
5506 /* Remember an explicit \r or \n, and add the range to the class. */
5508 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5510 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5511 because there are holes in the encoding, and simply using the range
5512 A-Z (for example) would include the characters in the holes. This
5513 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5516 if (range_is_literal &&
5517 (cb->ctypes[c] & ctype_letter) != 0 &&
5518 (cb->ctypes[d] & ctype_letter) != 0 &&
5519 (d <= CHAR_z) == (d <= CHAR_z))
5521 uint32_t uc = (d <= CHAR_z)? 0 : 64;
5522 uint32_t C = d - uc;
5523 uint32_t D = d - uc;
5527 class_has_8bitchar +=
5528 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5529 ((D < CHAR_i)? D : CHAR_i) + uc);
5533 if (C <= D && C <= CHAR_r)
5535 class_has_8bitchar +=
5536 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5537 ((D < CHAR_r)? D : CHAR_r) + uc);
5543 class_has_8bitchar +=
5544 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5550 /* Not an EBCDIC special range */
5552 class_has_8bitchar +=
5553 add_to_class(classbits, &class_uchardata, options, cb, c, d);
5554 goto CONTINUE_CLASS; /* Go get the next char in the class */
5555 } /* End of range handling */
5558 /* Handle a single character. */
5560 class_has_8bitchar +=
5561 add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5564 /* Continue to the next item in the class. */
5568 #ifdef SUPPORT_WIDE_CHARS
5569 /* If any wide characters or Unicode properties have been encountered,
5570 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
5571 of the extra data and reset the pointer. This is so that very large
5572 classes that contain a zillion wide characters or Unicode property tests
5573 do not overwrite the workspace (which is on the stack). */
5575 if (class_uchardata > class_uchardata_base)
5578 if (lengthptr != NULL)
5580 *lengthptr += class_uchardata - class_uchardata_base;
5581 class_uchardata = class_uchardata_base;
5586 continue; /* Needed to avoid error when not supporting wide chars */
5587 } /* End of main class-processing loop */
5589 /* If this class is the first thing in the branch, there can be no first
5590 char setting, whatever the repeat count. Any reqcu setting must remain
5591 unchanged after any kind of repeat. */
5593 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5594 zerofirstcu = firstcu;
5595 zerofirstcuflags = firstcuflags;
5597 zeroreqcuflags = reqcuflags;
5599 /* If there are characters with values > 255, or Unicode property settings
5600 (\p or \P), we have to compile an extended class, with its own opcode,
5601 unless there were no property settings and there was a negated special such
5602 as \S in the class, and PCRE2_UCP is not set, because in that case all
5603 characters > 255 are in or not in the class, so any that were explicitly
5604 given as well can be ignored.
5606 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
5607 [:^xdigit:]) were present in a class, we either have to match or not match
5608 all wide characters (depending on whether the whole class is or is not
5609 negated). This requirement is indicated by match_all_or_no_wide_chars being
5610 true. We do this by including an explicit range, which works in both cases.
5611 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
5612 cannot be any wide characters in 8-bit non-UTF mode.
5614 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
5615 class where \S etc is present without PCRE2_UCP, causing an extended class
5616 to be compiled, we make sure that all characters > 255 are included by
5617 forcing match_all_or_no_wide_chars to be true.
5619 If, when generating an xclass, there are no characters < 256, we can omit
5620 the bitmap in the actual compiled code. */
5622 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
5624 #ifdef SUPPORT_UNICODE
5625 (options & PCRE2_UCP) != 0 ||
5627 xclass_has_prop || !should_flip_negation))
5629 if (match_all_or_no_wide_chars || (
5630 #if PCRE2_CODE_UNIT_WIDTH == 8
5633 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
5635 *class_uchardata++ = XCL_RANGE;
5636 if (utf) /* Will always be utf in the 8-bit library */
5638 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5639 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
5641 else /* Can only happen for the 16-bit & 32-bit libraries */
5643 #if PCRE2_CODE_UNIT_WIDTH == 16
5644 *class_uchardata++ = 0x100;
5645 *class_uchardata++ = 0xffffu;
5646 #elif PCRE2_CODE_UNIT_WIDTH == 32
5647 *class_uchardata++ = 0x100;
5648 *class_uchardata++ = 0xffffffffu;
5652 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5653 *code++ = OP_XCLASS;
5655 *code = negate_class? XCL_NOT:0;
5656 if (xclass_has_prop) *code |= XCL_HASPROP;
5658 /* If the map is required, move up the extra data to make room for it;
5659 otherwise just move the code pointer to the end of the extra data. */
5661 if (class_has_8bitchar > 0)
5664 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
5665 CU2BYTES(class_uchardata - code));
5666 if (negate_class && !xclass_has_prop)
5667 for (i = 0; i < 32; i++) classbits[i] = ~classbits[i];
5668 memcpy(code, classbits, 32);
5669 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
5671 else code = class_uchardata;
5673 /* Now fill in the complete length of the item */
5675 PUT(previous, 1, (int)(code - previous));
5676 break; /* End of class handling */
5678 #endif /* SUPPORT_WIDE_CHARS */
5680 /* If there are no characters > 255, or they are all to be included or
5681 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5682 whole class was negated and whether there were negative specials such as \S
5683 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5684 negating it if necessary. */
5686 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5687 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5690 for (i = 0; i < 32; i++) classbits[i] = ~classbits[i];
5691 memcpy(code, classbits, 32);
5693 code += 32 / sizeof(PCRE2_UCHAR);
5694 break; /* End of class processing */
5697 /* ===================================================================*/
5698 /* Deal with (*VERB)s. */
5700 /* Check for open captures before ACCEPT and close those that are within
5701 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
5702 assertion. In the first pass, just accumulate the length required;
5703 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
5704 workspace overflow. Do not set firstcu after *ACCEPT. */
5707 cb->had_accept = TRUE;
5708 for (oc = cb->open_caps;
5709 oc != NULL && oc->assert_depth >= cb->assert_depth;
5712 if (lengthptr != NULL)
5714 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
5719 PUT2INC(code, 0, oc->number);
5722 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5723 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5728 cb->had_pruneorskip = TRUE;
5732 *code++ = verbops[(meta - META_MARK) >> 16];
5736 cb->external_flags |= PCRE2_HASTHEN;
5740 /* Handle verbs with arguments. Arguments can be very long, especially in
5741 16- and 32-bit modes, and can overflow the workspace in the first pass.
5742 However, the argument length is constrained to be small enough to fit in
5743 one code unit. This check happens in parse_regex(). In the first pass,
5744 instead of putting the argument into memory, we just update the length
5745 counter and set up an empty argument. */
5748 cb->external_flags |= PCRE2_HASTHEN;
5751 case META_PRUNE_ARG:
5753 cb->had_pruneorskip = TRUE;
5756 case META_COMMIT_ARG:
5758 *code++ = verbops[(meta - META_MARK) >> 16];
5759 /* The length is in characters. */
5760 verbarglen = *(++pptr);
5763 for (i = 0; i < (int)verbarglen; i++)
5766 #ifdef SUPPORT_UNICODE
5767 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
5773 if (lengthptr != NULL) *lengthptr += mclength; else
5775 memcpy(code, mcbuffer, CU2BYTES(mclength));
5777 verbculen += mclength;
5781 *tempcode = verbculen; /* Fill in the code unit length */
5782 *code++ = 0; /* Terminating zero */
5786 /* ===================================================================*/
5787 /* Handle options change. The new setting must be passed back for use in
5788 subsequent branches. Reset the greedy defaults and the case value for
5789 firstcu and reqcu. */
5792 *optionsptr = options = *(++pptr);
5793 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5794 greedy_non_default = greedy_default ^ 1;
5795 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5799 /* ===================================================================*/
5800 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
5801 because it could be a numerical check on recursion, or a name check on a
5802 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
5803 we can handle it either way. We first try for a name; if not found, process
5806 case META_COND_RNUMBER: /* (?(Rdigits) */
5807 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
5808 case META_COND_RNAME: /* (?(R&name) - test for recursion */
5813 named_group *ng = cb->named_groups;
5814 uint32_t length = *(++pptr);
5816 GETPLUSOFFSET(offset, pptr);
5817 name = cb->start_pattern + offset;
5819 /* In the first pass, the names generated in the pre-pass are available,
5820 but the main name table has not yet been created. Scan the list of names
5821 generated in the pre-pass in order to get a number and whether or not
5822 this name is duplicated. If it is not duplicated, we can handle it as a
5825 for (i = 0; i < cb->names_found; i++, ng++)
5827 if (length == ng->length &&
5828 PRIV(strncmp)(name, ng->name, length) == 0)
5832 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
5833 PUT2(code, 2+LINK_SIZE, ng->number);
5834 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
5835 skipunits = 1+IMM2_SIZE;
5836 goto GROUP_PROCESS_NOTE_EMPTY;
5838 break; /* Found a duplicated name */
5842 /* If the name was not found we have a bad reference, unless we are
5843 dealing with R<digits>, which is treated as a recursion test by number.
5846 if (i >= cb->names_found)
5849 if (meta == META_COND_RNUMBER)
5851 for (i = 1; i < (int)length; i++)
5853 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
5854 if (groupnumber > MAX_GROUP_NUMBER)
5856 *errorcodeptr = ERR61;
5857 cb->erroroffset = offset + i;
5863 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
5865 *errorcodeptr = ERR15;
5866 cb->erroroffset = offset;
5870 /* (?Rdigits) treated as a recursion reference by number. A value of
5871 zero (which is the result of both (?R) and (?R0)) means "any", and is
5872 translated into RREF_ANY (which is 0xffff). */
5874 if (groupnumber == 0) groupnumber = RREF_ANY;
5875 code[1+LINK_SIZE] = OP_RREF;
5876 PUT2(code, 2+LINK_SIZE, groupnumber);
5877 skipunits = 1+IMM2_SIZE;
5878 goto GROUP_PROCESS_NOTE_EMPTY;
5881 /* A duplicated name was found. Note that if an R<digits> name is found
5882 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
5884 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
5886 /* We have a duplicated name. In the compile pass we have to search the
5887 main table in order to get the index and count values. */
5889 count = 0; /* Values for first pass (avoids compiler warning) */
5891 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
5892 &count, errorcodeptr, cb)) return 0;
5894 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
5895 insert appropriate data values. */
5897 code[1+LINK_SIZE]++;
5898 skipunits = 1+2*IMM2_SIZE;
5899 PUT2(code, 2+LINK_SIZE, index);
5900 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
5902 goto GROUP_PROCESS_NOTE_EMPTY;
5904 /* The DEFINE condition is always false. Its internal groups may never
5905 be called, so matched_char must remain false, hence the jump to
5906 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
5908 case META_COND_DEFINE:
5910 GETPLUSOFFSET(offset, pptr);
5911 code[1+LINK_SIZE] = OP_DEFINE;
5915 /* Conditional test of a group's being set. */
5917 case META_COND_NUMBER:
5919 GETPLUSOFFSET(offset, pptr);
5920 groupnumber = *(++pptr);
5921 if (groupnumber > cb->bracount)
5923 *errorcodeptr = ERR15;
5924 cb->erroroffset = offset;
5927 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5928 offset -= 2; /* Point at initial ( for too many branches error */
5929 code[1+LINK_SIZE] = OP_CREF;
5930 skipunits = 1+IMM2_SIZE;
5931 PUT2(code, 2+LINK_SIZE, groupnumber);
5932 goto GROUP_PROCESS_NOTE_EMPTY;
5934 /* Test for the PCRE2 version. */
5936 case META_COND_VERSION:
5939 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
5940 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
5943 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
5947 goto GROUP_PROCESS_NOTE_EMPTY;
5949 /* The condition is an assertion, possibly preceded by a callout. */
5951 case META_COND_ASSERT:
5953 goto GROUP_PROCESS_NOTE_EMPTY;
5956 /* ===================================================================*/
5957 /* Handle all kinds of nested bracketed groups. The non-capturing,
5958 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
5960 case META_LOOKAHEAD:
5961 bravalue = OP_ASSERT;
5962 cb->assert_depth += 1;
5965 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
5966 thing to do, but Perl allows all assertions to be quantified, and when
5967 they contain capturing parentheses there may be a potential use for
5968 this feature. Not that that applies to a quantified (?!) but we allow
5969 it for uniformity. */
5971 case META_LOOKAHEADNOT:
5972 if (pptr[1] == META_KET &&
5973 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
5980 bravalue = OP_ASSERT_NOT;
5981 cb->assert_depth += 1;
5986 case META_LOOKBEHIND:
5987 bravalue = OP_ASSERTBACK;
5988 cb->assert_depth += 1;
5991 case META_LOOKBEHINDNOT:
5992 bravalue = OP_ASSERTBACK_NOT;
5993 cb->assert_depth += 1;
5998 goto GROUP_PROCESS_NOTE_EMPTY;
6000 case META_NOCAPTURE:
6004 /* Process nested bracketed regex. The nesting depth is maintained for the
6005 benefit of the stackguard function. The test for too deep nesting is now
6006 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6007 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6008 note of whether or not they may match an empty string. */
6010 GROUP_PROCESS_NOTE_EMPTY:
6011 note_group_empty = TRUE;
6014 cb->parens_depth += 1;
6018 tempreqvary = cb->req_varyopt; /* Save value before group */
6019 length_prevgroup = 0; /* Initialize for pre-compile phase */
6023 options, /* The option state */
6024 &tempcode, /* Where to put code (updated) */
6025 &pptr, /* Input pointer (updated) */
6026 errorcodeptr, /* Where to put an error message */
6027 skipunits, /* Skip over bracket number */
6028 &subfirstcu, /* For possible first char */
6030 &subreqcu, /* For possible last char */
6032 bcptr, /* Current branch chain */
6033 cb, /* Compile data block */
6034 (lengthptr == NULL)? NULL : /* Actual compile phase */
6035 &length_prevgroup /* Pre-compile phase */
6037 return 0; /* Error */
6039 cb->parens_depth -= 1;
6041 /* If that was a non-conditional significant group (not an assertion, not a
6042 DEFINE) that matches at least one character, then the current item matches
6043 a character. Conditionals are handled below. */
6045 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6046 matched_char = TRUE;
6048 /* If we've just compiled an assertion, pop the assert depth. */
6050 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6051 cb->assert_depth -= 1;
6053 /* At the end of compiling, code is still pointing to the start of the
6054 group, while tempcode has been updated to point past the end of the group.
6055 The parsed pattern pointer (pptr) is on the closing META_KET.
6057 If this is a conditional bracket, check that there are no more than
6058 two branches in the group, or just one if it's a DEFINE group. We do this
6059 in the real compile phase, not in the pre-pass, where the whole group may
6060 not be available. */
6062 if (bravalue == OP_COND && lengthptr == NULL)
6064 PCRE2_UCHAR *tc = code;
6071 while (*tc != OP_KET);
6073 /* A DEFINE group is never obeyed inline (the "condition" is always
6074 false). It must have only one branch. Having checked this, change the
6075 opcode to OP_FALSE. */
6077 if (code[LINK_SIZE+1] == OP_DEFINE)
6081 cb->erroroffset = offset;
6082 *errorcodeptr = ERR54;
6085 code[LINK_SIZE+1] = OP_FALSE;
6086 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6089 /* A "normal" conditional group. If there is just one branch, we must not
6090 make use of its firstcu or reqcu, because this is equivalent to an
6091 empty second branch. Also, it may match an empty string. If there are two
6092 branches, this item must match a character if the group must. */
6098 cb->erroroffset = offset;
6099 *errorcodeptr = ERR27;
6102 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6103 else if (group_return > 0) matched_char = TRUE;
6107 /* In the pre-compile phase, update the length by the length of the group,
6108 less the brackets at either end. Then reduce the compiled code to just a
6109 set of non-capturing brackets so that it doesn't use much memory if it is
6110 duplicated by a quantifier.*/
6112 if (lengthptr != NULL)
6114 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6116 *errorcodeptr = ERR20;
6119 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6120 code++; /* This already contains bravalue */
6121 PUTINC(code, 0, 1 + LINK_SIZE);
6123 PUTINC(code, 0, 1 + LINK_SIZE);
6124 break; /* No need to waste time with special character handling */
6127 /* Otherwise update the main code pointer to the end of the group. */
6131 /* For a DEFINE group, required and first character settings are not
6134 if (bravalue == OP_DEFINE) break;
6136 /* Handle updating of the required and first code units for other types of
6137 group. Update for normal brackets of all kinds, and conditions with two
6138 branches (see code above). If the bracket is followed by a quantifier with
6139 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6140 zerofirstcu outside the main loop so that they can be accessed for the back
6144 zeroreqcuflags = reqcuflags;
6145 zerofirstcu = firstcu;
6146 zerofirstcuflags = firstcuflags;
6147 groupsetfirstcu = FALSE;
6149 if (bravalue >= OP_ONCE) /* Not an assertion */
6151 /* If we have not yet set a firstcu in this branch, take it from the
6152 subpattern, remembering that it was set here so that a repeat of more
6153 than one can replicate it as reqcu if necessary. If the subpattern has
6154 no firstcu, set "none" for the whole branch. In both cases, a zero
6155 repeat forces firstcu to "none". */
6157 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6159 if (subfirstcuflags >= 0)
6161 firstcu = subfirstcu;
6162 firstcuflags = subfirstcuflags;
6163 groupsetfirstcu = TRUE;
6165 else firstcuflags = REQ_NONE;
6166 zerofirstcuflags = REQ_NONE;
6169 /* If firstcu was previously set, convert the subpattern's firstcu
6170 into reqcu if there wasn't one, using the vary flag that was in
6171 existence beforehand. */
6173 else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6175 subreqcu = subfirstcu;
6176 subreqcuflags = subfirstcuflags | tempreqvary;
6179 /* If the subpattern set a required code unit (or set a first code unit
6180 that isn't really the first code unit - see above), set it. */
6182 if (subreqcuflags >= 0)
6185 reqcuflags = subreqcuflags;
6189 /* For a forward assertion, we take the reqcu, if set, provided that the
6190 group has also set a firstcu. This can be helpful if the pattern that
6191 follows the assertion doesn't set a different char. For example, it's
6192 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6193 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6194 the "real" "a" would then become a reqcu instead of a firstcu. This is
6195 overcome by a scan at the end if there's no firstcu, looking for an
6196 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6197 we must only take the reqcu when the group also set a firstcu. Otherwise,
6198 in that example, 'X' ends up set for both. */
6200 else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
6201 subfirstcuflags >= 0)
6204 reqcuflags = subreqcuflags;
6207 break; /* End of nested group handling */
6210 /* ===================================================================*/
6211 /* Handle named backreferences and recursions. */
6213 case META_BACKREF_BYNAME:
6214 case META_RECURSE_BYNAME:
6218 BOOL is_dupname = FALSE;
6219 named_group *ng = cb->named_groups;
6220 uint32_t length = *(++pptr);
6222 GETPLUSOFFSET(offset, pptr);
6223 name = cb->start_pattern + offset;
6225 /* In the first pass, the names generated in the pre-pass are available,
6226 but the main name table has not yet been created. Scan the list of names
6227 generated in the pre-pass in order to get a number and whether or not
6228 this name is duplicated. */
6231 for (i = 0; i < cb->names_found; i++, ng++)
6233 if (length == ng->length &&
6234 PRIV(strncmp)(name, ng->name, length) == 0)
6236 is_dupname = ng->isdup;
6237 groupnumber = ng->number;
6239 /* For a recursion, that's all that is needed. We can now go to
6240 the code above that handles numerical recursion, applying it to
6241 the first group with the given name. */
6243 if (meta == META_RECURSE_BYNAME)
6245 meta_arg = groupnumber;
6246 goto HANDLE_NUMERICAL_RECURSION;
6249 /* For a back reference, update the back reference map and the
6250 maximum back reference. Then, for each group, we must check to
6251 see if it is recursive, that is, it is inside the group that it
6252 references. A flag is set so that the group can be made atomic.
6255 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6256 if (groupnumber > cb->top_backref)
6257 cb->top_backref = groupnumber;
6259 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
6261 if (oc->number == groupnumber)
6270 /* If the name was not found we have a bad reference. */
6272 if (groupnumber == 0)
6274 *errorcodeptr = ERR15;
6275 cb->erroroffset = offset;
6279 /* If a back reference name is not duplicated, we can handle it as
6280 a numerical reference. */
6284 meta_arg = groupnumber;
6285 goto HANDLE_SINGLE_REFERENCE;
6288 /* If a back reference name is duplicated, we generate a different
6289 opcode to a numerical back reference. In the second pass we must
6290 search for the index and count in the final name table. */
6292 count = 0; /* Values for first pass (avoids compiler warning) */
6294 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6295 &count, errorcodeptr, cb)) return 0;
6297 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6298 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6299 PUT2INC(code, 0, index);
6300 PUT2INC(code, 0, count);
6305 /* ===================================================================*/
6306 /* Handle a numerical callout. */
6308 case META_CALLOUT_NUMBER:
6309 code[0] = OP_CALLOUT;
6310 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6311 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6312 code[1 + 2*LINK_SIZE] = pptr[3];
6314 code += PRIV(OP_lengths)[OP_CALLOUT];
6318 /* ===================================================================*/
6319 /* Handle a callout with a string argument. In the pre-pass we just compute
6320 the length without generating anything. The length in pptr[3] includes both
6321 delimiters; in the actual compile only the first one is copied, but a
6322 terminating zero is added. Any doubled delimiters within the string make
6323 this an overestimate, but it is not worth bothering about. */
6325 case META_CALLOUT_STRING:
6326 if (lengthptr != NULL)
6328 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6333 /* In the real compile we can copy the string. The starting delimiter is
6334 included so that the client can discover it if they want. We also pass the
6335 start offset to help a script language give better error messages. */
6341 uint32_t length = pptr[3];
6342 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6344 code[0] = OP_CALLOUT_STR;
6345 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6346 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6349 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
6350 pp = cb->start_pattern + offset;
6351 delimiter = *callout_string++ = *pp++;
6352 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6353 delimiter = CHAR_RIGHT_CURLY_BRACKET;
6354 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
6356 /* The syntax of the pattern was checked in the parsing scan. The length
6357 includes both delimiters, but we have passed the opening one just above,
6358 so we reduce length before testing it. The test is for > 1 because we do
6359 not want to copy the final delimiter. This also ensures that pp[1] is
6362 while (--length > 1)
6364 if (*pp == delimiter && pp[1] == delimiter)
6366 *callout_string++ = delimiter;
6370 else *callout_string++ = *pp++;
6372 *callout_string++ = CHAR_NUL;
6374 /* Set the length of the entire item, then advance to its end. */
6376 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6377 code = callout_string;
6382 /* ===================================================================*/
6383 /* Handle repetition. The different types are all sorted out in the parsing
6386 case META_MINMAX_PLUS:
6387 case META_MINMAX_QUERY:
6389 repeat_min = *(++pptr);
6390 repeat_max = *(++pptr);
6394 case META_ASTERISK_PLUS:
6395 case META_ASTERISK_QUERY:
6397 repeat_max = REPEAT_UNLIMITED;
6401 case META_PLUS_PLUS:
6402 case META_PLUS_QUERY:
6404 repeat_max = REPEAT_UNLIMITED;
6408 case META_QUERY_PLUS:
6409 case META_QUERY_QUERY:
6414 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6416 /* Remember whether this is a variable length repeat, and default to
6417 single-char opcodes. */
6419 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6422 /* If the repeat is {1} we can ignore it. */
6424 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6426 /* Adjust first and required code units for a zero repeat. */
6428 if (repeat_min == 0)
6430 firstcu = zerofirstcu;
6431 firstcuflags = zerofirstcuflags;
6433 reqcuflags = zeroreqcuflags;
6436 /* Note the greediness and possessiveness. */
6440 case META_MINMAX_PLUS:
6441 case META_ASTERISK_PLUS:
6442 case META_PLUS_PLUS:
6443 case META_QUERY_PLUS:
6444 repeat_type = 0; /* Force greedy */
6445 possessive_quantifier = TRUE;
6448 case META_MINMAX_QUERY:
6449 case META_ASTERISK_QUERY:
6450 case META_PLUS_QUERY:
6451 case META_QUERY_QUERY:
6452 repeat_type = greedy_non_default;
6453 possessive_quantifier = FALSE;
6457 repeat_type = greedy_default;
6458 possessive_quantifier = FALSE;
6462 /* Save start of previous item, in case we have to move it up in order to
6463 insert something before it, and remember what it was. */
6465 tempcode = previous;
6466 op_previous = *previous;
6468 /* Now handle repetition for the different types of item. */
6470 switch (op_previous)
6472 /* If previous was a character or negated character match, abolish the
6473 item and generate a repeat item instead. If a char item has a minimum of
6474 more than one, ensure that it is set in reqcu - it might not be if a
6475 sequence such as x{3} is the first thing in a branch because the x will
6476 have gone into firstcu instead. */
6482 op_type = chartypeoffset[op_previous - OP_CHAR];
6484 /* Deal with UTF characters that take up more than one code unit. */
6486 #ifdef MAYBE_UTF_MULTI
6487 if (utf && NOT_FIRSTCU(code[-1]))
6489 PCRE2_UCHAR *lastchar = code - 1;
6491 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
6492 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
6495 #endif /* MAYBE_UTF_MULTI */
6497 /* Handle the case of a single code unit - either with no UTF support, or
6498 with UTF disabled, or for a single-code-unit UTF character. */
6500 mcbuffer[0] = code[-1];
6502 if (op_previous <= OP_CHARI && repeat_min > 1)
6504 reqcu = mcbuffer[0];
6505 reqcuflags = req_caseopt | cb->req_varyopt;
6508 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
6510 /* If previous was a character class or a back reference, we put the
6511 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6513 #ifdef SUPPORT_WIDE_CHARS
6523 if (repeat_max == 0)
6529 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6530 *code++ = OP_CRSTAR + repeat_type;
6531 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6532 *code++ = OP_CRPLUS + repeat_type;
6533 else if (repeat_min == 0 && repeat_max == 1)
6534 *code++ = OP_CRQUERY + repeat_type;
6537 *code++ = OP_CRRANGE + repeat_type;
6538 PUT2INC(code, 0, repeat_min);
6539 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
6540 PUT2INC(code, 0, repeat_max);
6544 /* If previous is OP_FAIL, it was generated by an empty class []
6545 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6546 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6547 time. We can just ignore this repeat. */
6552 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6553 because pcre2_match() could not handle backtracking into recursively
6554 called groups. Now that this backtracking is available, we no longer need
6555 to do this. However, we still need to replicate recursions as we do for
6556 groups so as to have independent backtracking points. We can replicate
6557 for the minimum number of repeats directly. For optional repeats we now
6558 wrap the recursion in OP_BRA brackets and make use of the bracket
6563 /* Generate unwrapped repeats for a non-zero minimum, except when the
6564 minimum is 1 and the maximum unlimited, because that can be handled with
6565 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
6566 minimum, we just need to generate the appropriate additional copies.
6567 Otherwise we need to generate one more, to simulate the situation when
6568 the minimum is zero. */
6570 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
6572 int replicate = repeat_min;
6573 if (repeat_min == repeat_max) replicate--;
6575 /* In the pre-compile phase, we don't actually do the replication. We
6576 just adjust the length as if we had. Do some paranoid checks for
6577 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6578 integer type when available, otherwise double. */
6580 if (lengthptr != NULL)
6582 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
6583 if ((INT64_OR_DOUBLE)replicate*
6584 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
6585 (INT64_OR_DOUBLE)INT_MAX ||
6586 OFLOW_MAX - *lengthptr < delta)
6588 *errorcodeptr = ERR20;
6591 *lengthptr += delta;
6594 else for (i = 0; i < replicate; i++)
6596 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
6598 code += 1 + LINK_SIZE;
6601 /* If the number of repeats is fixed, we are done. Otherwise, adjust
6602 the counts and fall through. */
6604 if (repeat_min == repeat_max) break;
6605 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
6609 /* Wrap the recursion call in OP_BRA brackets. */
6611 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
6612 op_previous = *previous = OP_BRA;
6613 PUT(previous, 1, 2 + 2*LINK_SIZE);
6614 previous[2 + 2*LINK_SIZE] = OP_KET;
6615 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
6616 code += 2 + 2 * LINK_SIZE;
6617 length_prevgroup = 3 + 3*LINK_SIZE;
6618 group_return = -1; /* Set "may match empty string" */
6620 /* Now treat as a repeated OP_BRA. */
6623 /* If previous was a bracket group, we may have to replicate it in
6624 certain cases. Note that at this point we can encounter only the "basic"
6625 bracket opcodes such as BRA and CBRA, as this is the place where they get
6626 converted into the more special varieties such as BRAPOS and SBRA.
6627 Originally, PCRE did not allow repetition of assertions, but now it does,
6628 for Perl compatibility. */
6633 case OP_ASSERTBACK_NOT:
6639 int len = (int)(code - previous);
6640 PCRE2_UCHAR *bralink = NULL;
6641 PCRE2_UCHAR *brazeroptr = NULL;
6643 /* Repeating a DEFINE group (or any group where the condition is always
6644 FALSE and there is only one branch) is pointless, but Perl allows the
6645 syntax, so we just ignore the repeat. */
6647 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
6648 previous[GET(previous, 1)] != OP_ALT)
6651 /* There is no sense in actually repeating assertions. The only
6652 potential use of repetition is in cases when the assertion is optional.
6653 Therefore, if the minimum is greater than zero, just ignore the repeat.
6654 If the maximum is not zero or one, set it to 1. */
6656 if (op_previous < OP_ONCE) /* Assertion */
6658 if (repeat_min > 0) goto END_REPEAT;
6659 if (repeat_max > 1) repeat_max = 1;
6662 /* The case of a zero minimum is special because of the need to stick
6663 OP_BRAZERO in front of it, and because the group appears once in the
6664 data, whereas in other cases it appears the minimum number of times. For
6665 this reason, it is simplest to treat this case separately, as otherwise
6666 the code gets far too messy. There are several special subcases when the
6669 if (repeat_min == 0)
6671 /* If the maximum is also zero, we used to just omit the group from
6672 the output altogether, like this:
6674 ** if (repeat_max == 0)
6680 However, that fails when a group or a subgroup within it is
6681 referenced as a subroutine from elsewhere in the pattern, so now we
6682 stick in OP_SKIPZERO in front of it so that it is skipped on
6683 execution. As we don't have a list of which groups are referenced, we
6684 cannot do this selectively.
6686 If the maximum is 1 or unlimited, we just have to stick in the
6687 BRAZERO and do no more at this point. */
6689 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
6691 (void)memmove(previous + 1, previous, CU2BYTES(len));
6693 if (repeat_max == 0)
6695 *previous++ = OP_SKIPZERO;
6698 brazeroptr = previous; /* Save for possessive optimizing */
6699 *previous++ = OP_BRAZERO + repeat_type;
6702 /* If the maximum is greater than 1 and limited, we have to replicate
6703 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6704 The first one has to be handled carefully because it's the original
6705 copy, which has to be moved up. The remainder can be handled by code
6706 that is common with the non-zero minimum case below. We have to
6707 adjust the value of repeat_max, since one less copy is required. */
6712 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
6713 code += 2 + LINK_SIZE;
6714 *previous++ = OP_BRAZERO + repeat_type;
6715 *previous++ = OP_BRA;
6717 /* We chain together the bracket link offset fields that have to be
6718 filled in later when the ends of the brackets are reached. */
6720 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
6722 PUTINC(previous, 0, linkoffset);
6725 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
6728 /* If the minimum is greater than zero, replicate the group as many
6729 times as necessary, and adjust the maximum to the number of subsequent
6730 copies that we need. */
6736 /* In the pre-compile phase, we don't actually do the replication.
6737 We just adjust the length as if we had. Do some paranoid checks for
6738 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6739 integer type when available, otherwise double. */
6741 if (lengthptr != NULL)
6743 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
6744 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6745 (INT64_OR_DOUBLE)length_prevgroup >
6746 (INT64_OR_DOUBLE)INT_MAX ||
6747 OFLOW_MAX - *lengthptr < delta)
6749 *errorcodeptr = ERR20;
6752 *lengthptr += delta;
6755 /* This is compiling for real. If there is a set first code unit
6756 for the group, and we have not yet set a "required code unit", set
6761 if (groupsetfirstcu && reqcuflags < 0)
6764 reqcuflags = firstcuflags;
6766 for (i = 1; (uint32_t)i < repeat_min; i++)
6768 memcpy(code, previous, CU2BYTES(len));
6774 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
6777 /* This code is common to both the zero and non-zero minimum cases. If
6778 the maximum is limited, it replicates the group in a nested fashion,
6779 remembering the bracket starts on a stack. In the case of a zero
6780 minimum, the first one was set up above. In all cases the repeat_max
6781 now specifies the number of additional copies needed. Again, we must
6782 remember to replicate entries on the forward reference list. */
6784 if (repeat_max != REPEAT_UNLIMITED)
6786 /* In the pre-compile phase, we don't actually do the replication. We
6787 just adjust the length as if we had. For each repetition we must add
6788 1 to the length for BRAZERO and for all but the last repetition we
6789 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6790 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
6791 is a 64-bit integer type when available, otherwise double. */
6793 if (lengthptr != NULL && repeat_max > 0)
6795 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6796 2 - 2*LINK_SIZE; /* Last one doesn't nest */
6797 if ((INT64_OR_DOUBLE)repeat_max *
6798 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6799 > (INT64_OR_DOUBLE)INT_MAX ||
6800 OFLOW_MAX - *lengthptr < delta)
6802 *errorcodeptr = ERR20;
6805 *lengthptr += delta;
6808 /* This is compiling for real */
6810 else for (i = repeat_max - 1; i >= 0; i--)
6812 *code++ = OP_BRAZERO + repeat_type;
6814 /* All but the final copy start a new nesting, maintaining the
6815 chain of brackets outstanding. */
6821 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
6823 PUTINC(code, 0, linkoffset);
6826 memcpy(code, previous, CU2BYTES(len));
6830 /* Now chain through the pending brackets, and fill in their length
6831 fields (which are holding the chain links pro tem). */
6833 while (bralink != NULL)
6836 int linkoffset = (int)(code - bralink + 1);
6837 PCRE2_UCHAR *bra = code - linkoffset;
6838 oldlinkoffset = GET(bra, 1);
6839 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6841 PUTINC(code, 0, linkoffset);
6842 PUT(bra, 1, linkoffset);
6846 /* If the maximum is unlimited, set a repeater in the final copy. For
6847 ONCE brackets, that's all we need to do. However, possessively repeated
6848 ONCE brackets can be converted into non-capturing brackets, as the
6849 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6850 deal with possessive ONCEs specially.
6852 Otherwise, when we are doing the actual compile phase, check to see
6853 whether this group is one that could match an empty string. If so,
6854 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6855 that runtime checking can be done. [This check is also applied to ONCE
6856 groups at runtime, but in a different way.]
6858 Then, if the quantifier was possessive and the bracket is not a
6859 conditional, we convert the BRA code to the POS form, and the KET code to
6860 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6861 subpattern at both the start and at the end.) The use of special opcodes
6862 makes it possible to reduce greatly the stack usage in pcre2_match(). If
6863 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6865 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6866 flag so that the default action below, of wrapping everything inside
6867 atomic brackets, does not happen. When the minimum is greater than 1,
6868 there will be earlier copies of the group, and so we still have to wrap
6873 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
6874 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
6876 /* Convert possessive ONCE brackets to non-capturing */
6878 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
6880 /* For non-possessive ONCE brackets, all we need to do is to
6883 if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type;
6885 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6886 converted to non-capturing above). */
6890 /* In the compile phase, adjust the opcode if the group can match
6891 an empty string. For a conditional group with only one branch, the
6892 value of group_return will not show "could be empty", so we must
6893 check that separately. */
6895 if (lengthptr == NULL)
6897 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
6898 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6899 *bracode = OP_SCOND;
6902 /* Handle possessive quantifiers. */
6904 if (possessive_quantifier)
6906 /* For COND brackets, we wrap the whole thing in a possessively
6907 repeated non-capturing bracket, because we have not invented POS
6908 versions of the COND opcodes. */
6910 if (*bracode == OP_COND || *bracode == OP_SCOND)
6912 int nlen = (int)(code - bracode);
6913 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
6914 code += 1 + LINK_SIZE;
6915 nlen += 1 + LINK_SIZE;
6916 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6917 *code++ = OP_KETRPOS;
6918 PUTINC(code, 0, nlen);
6919 PUT(bracode, 1, nlen);
6922 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6926 *bracode += 1; /* Switch to xxxPOS opcodes */
6927 *ketcode = OP_KETRPOS;
6930 /* If the minimum is zero, mark it as possessive, then unset the
6931 possessive flag when the minimum is 0 or 1. */
6933 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6934 if (repeat_min < 2) possessive_quantifier = FALSE;
6937 /* Non-possessive quantifier */
6939 else *ketcode = OP_KETRMAX + repeat_type;
6945 /* If previous was a character type match (\d or similar), abolish it and
6946 create a suitable repeat item. The code is shared with single-character
6947 repeats by setting op_type to add a suitable offset into repeat_type.
6948 Note that the Unicode property types will be present only when
6949 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
6950 here because it just makes it horribly messy. */
6953 if (op_previous >= OP_EODN) /* Not a character type - internal error */
6955 *errorcodeptr = ERR10;
6960 int prop_type, prop_value;
6961 PCRE2_UCHAR *oldcode;
6963 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
6964 mclength = 0; /* Not a character */
6966 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
6968 prop_type = previous[1];
6969 prop_value = previous[2];
6973 /* Come here from just above with a character in mcbuffer/mclength. */
6974 OUTPUT_SINGLE_REPEAT:
6975 prop_type = prop_value = -1;
6978 /* At this point, if prop_type == prop_value == -1 we either have a
6979 character in mcbuffer when mclength is greater than zero, or we have
6980 mclength zero, in which case there is a non-property character type in
6981 op_previous. If prop_type/value are not negative, we have a property
6982 character type in op_previous. */
6984 oldcode = code; /* Save where we were */
6985 code = previous; /* Usually overwrite previous item */
6987 /* If the maximum is zero then the minimum must also be zero; Perl allows
6988 this case, so we do too - by simply omitting the item altogether. */
6990 if (repeat_max == 0) goto END_REPEAT;
6992 /* Combine the op_type with the repeat_type */
6994 repeat_type += op_type;
6996 /* A minimum of zero is handled either as the special case * or ?, or as
6997 an UPTO, with the maximum given. */
6999 if (repeat_min == 0)
7001 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7002 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7005 *code++ = OP_UPTO + repeat_type;
7006 PUT2INC(code, 0, repeat_max);
7010 /* A repeat minimum of 1 is optimized into some special cases. If the
7011 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7012 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7013 one less than the maximum. */
7015 else if (repeat_min == 1)
7017 if (repeat_max == REPEAT_UNLIMITED)
7018 *code++ = OP_PLUS + repeat_type;
7021 code = oldcode; /* Leave previous item in place */
7022 if (repeat_max == 1) goto END_REPEAT;
7023 *code++ = OP_UPTO + repeat_type;
7024 PUT2INC(code, 0, repeat_max - 1);
7028 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7029 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7033 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7034 PUT2INC(code, 0, repeat_min);
7036 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7037 and then generate the second opcode. For a repeated Unicode property
7038 match, there are two extra values that define the required property,
7039 and mclength is set zero to indicate this. */
7041 if (repeat_max != repeat_min)
7045 memcpy(code, mcbuffer, CU2BYTES(mclength));
7050 *code++ = op_previous;
7053 *code++ = prop_type;
7054 *code++ = prop_value;
7058 /* Now set up the following opcode */
7060 if (repeat_max == REPEAT_UNLIMITED)
7061 *code++ = OP_STAR + repeat_type;
7064 repeat_max -= repeat_min;
7065 if (repeat_max == 1)
7067 *code++ = OP_QUERY + repeat_type;
7071 *code++ = OP_UPTO + repeat_type;
7072 PUT2INC(code, 0, repeat_max);
7078 /* Fill in the character or character type for the final opcode. */
7082 memcpy(code, mcbuffer, CU2BYTES(mclength));
7087 *code++ = op_previous;
7090 *code++ = prop_type;
7091 *code++ = prop_value;
7096 } /* End of switch on different op_previous values */
7099 /* If the character following a repeat is '+', possessive_quantifier is
7100 TRUE. For some opcodes, there are special alternative opcodes for this
7101 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7102 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7103 Sun's Java package, but the special opcodes can optimize it.
7105 Some (but not all) possessively repeated subpatterns have already been
7106 completely handled in the code just above. For them, possessive_quantifier
7107 is always FALSE at this stage. Note that the repeated item starts at
7108 tempcode, not at previous, which might be the first part of a string whose
7109 (former) last char we repeated. */
7111 if (possessive_quantifier)
7115 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7116 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7117 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7118 remains is greater than zero, there's a further opcode that can be
7119 handled. If not, do nothing, leaving the EXACT alone. */
7124 tempcode += PRIV(OP_lengths)[*tempcode] +
7125 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7126 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7129 /* CHAR opcodes are used for exacts whose count is 1. */
7139 tempcode += PRIV(OP_lengths)[*tempcode];
7140 #ifdef SUPPORT_UNICODE
7141 if (utf && HAS_EXTRALEN(tempcode[-1]))
7142 tempcode += GET_EXTRALEN(tempcode[-1]);
7146 /* For the class opcodes, the repeat operator appears at the end;
7147 adjust tempcode to point to it. */
7151 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7154 #ifdef SUPPORT_WIDE_CHARS
7156 tempcode += GET(tempcode, 1);
7161 /* If tempcode is equal to code (which points to the end of the repeated
7162 item), it means we have skipped an EXACT item but there is no following
7163 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7164 all other cases, tempcode will be pointing to the repeat opcode, and will
7165 be less than code, so the value of len will be greater than 0. */
7167 len = (int)(code - tempcode);
7170 unsigned int repcode = *tempcode;
7172 /* There is a table for possessifying opcodes, all of which are less
7173 than OP_CALLOUT. A zero entry means there is no possessified version.
7176 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7177 *tempcode = opcode_possessify[repcode];
7179 /* For opcode without a special possessified version, wrap the item in
7184 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7185 code += 1 + LINK_SIZE;
7186 len += 1 + LINK_SIZE;
7187 tempcode[0] = OP_ONCE;
7189 PUTINC(code, 0, len);
7190 PUT(tempcode, 1, len);
7195 /* We set the "follows varying string" flag for subsequently encountered
7196 reqcus if it isn't already set and we have just passed a varying length
7200 cb->req_varyopt |= reqvary;
7204 /* ===================================================================*/
7205 /* Handle a 32-bit data character with a value greater than META_END. */
7212 /* ===============================================================*/
7213 /* Handle a back reference by number, which is the meta argument. The
7214 pattern offsets for back references to group numbers less than 10 are held
7215 in a special vector, to avoid using more than two parsed pattern elements
7216 in 64-bit environments. We only need the offset to the first occurrence,
7217 because if that doesn't fail, subsequent ones will also be OK. */
7220 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7221 else GETPLUSOFFSET(offset, pptr);
7223 if (meta_arg > cb->bracount)
7225 cb->erroroffset = offset;
7226 *errorcodeptr = ERR15; /* Non-existent subpattern */
7230 /* Come here from named backref handling when the reference is to a
7231 single group (that is, not to a duplicated name). The back reference
7232 data will have already been updated. We must disable firstcu if not
7233 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7236 HANDLE_SINGLE_REFERENCE:
7237 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7238 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7239 PUT2INC(code, 0, meta_arg);
7241 /* Update the map of back references, and keep the highest one. We
7242 could do this in parse_regex() for numerical back references, but not
7243 for named back references, because we don't know the numbers to which
7244 named back references refer. So we do it all in this function. */
7246 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7247 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7249 /* Check to see if this back reference is recursive, that is, it
7250 is inside the group that it references. A flag is set so that the
7251 group can be made atomic. */
7253 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
7255 if (oc->number == meta_arg)
7264 /* ===============================================================*/
7265 /* Handle recursion by inserting the number of the called group (which is
7266 the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7267 scanned and these numbers are replaced by offsets within the pattern. It is
7268 done like this to avoid problems with forward references and adjusting
7269 offsets when groups are duplicated and moved (as discovered in previous
7270 implementations). Note that a recursion does not have a set first character
7271 (relevant if it is repeated, because it will then be wrapped with ONCE
7275 GETPLUSOFFSET(offset, pptr);
7276 if (meta_arg > cb->bracount)
7278 cb->erroroffset = offset;
7279 *errorcodeptr = ERR15; /* Non-existent subpattern */
7282 HANDLE_NUMERICAL_RECURSION:
7284 PUT(code, 1, meta_arg);
7285 code += 1 + LINK_SIZE;
7286 groupsetfirstcu = FALSE;
7287 cb->had_recurse = TRUE;
7288 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7292 /* ===============================================================*/
7293 /* Handle capturing parentheses; the number is the meta argument. */
7297 skipunits = IMM2_SIZE;
7298 PUT2(code, 1+LINK_SIZE, meta_arg);
7299 cb->lastcapture = meta_arg;
7300 goto GROUP_PROCESS_NOTE_EMPTY;
7303 /* ===============================================================*/
7304 /* Handle escape sequence items. For ones like \d, the ESC_values are
7305 arranged to be the same as the corresponding OP_values in the default case
7306 when PCRE2_UCP is not set (which is the only case in which they will appear
7309 Note: \Q and \E are never seen here, as they were dealt with in
7310 parse_pattern(). Neither are numerical back references or recursions, which
7311 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7312 \g, when followed by names, are turned into META_BACKREF_BYNAME or
7313 META_RECURSE_BYNAME. */
7317 /* We can test for escape sequences that consume a character because their
7318 values lie between ESC_b and ESC_Z; this may have to change if any new ones
7319 are ever created. For these sequences, we disable the setting of a first
7320 character if it hasn't already been set. */
7322 if (meta_arg > ESC_b && meta_arg < ESC_Z)
7324 matched_char = TRUE;
7325 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7328 /* Set values to reset to if this is followed by a zero repeat. */
7330 zerofirstcu = firstcu;
7331 zerofirstcuflags = firstcuflags;
7333 zeroreqcuflags = reqcuflags;
7335 /* If Unicode is not supported, \P and \p are not allowed and are
7336 faulted at parse time, so will never appear here. */
7338 #ifdef SUPPORT_UNICODE
7339 if (meta_arg == ESC_P || meta_arg == ESC_p)
7341 uint32_t ptype = *(++pptr) >> 16;
7342 uint32_t pdata = *pptr & 0xffff;
7343 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7346 break; /* End META_ESCAPE */
7350 /* For the rest (including \X when Unicode is supported - if not it's
7351 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7352 not set; if it is set, these escapes do not show up here because they are
7353 converted into Unicode property tests in parse_regex(). Note that \b and \B
7354 do a one-character lookbehind, and \A also behaves as if it does. */
7356 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7357 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7358 cb->max_lookbehind == 0)
7359 cb->max_lookbehind = 1;
7361 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7362 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7364 #if PCRE2_CODE_UNIT_WIDTH == 32
7365 *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7367 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7369 break; /* End META_ESCAPE */
7372 /* ===================================================================*/
7373 /* Handle an unrecognized meta value. A parsed pattern value less than
7374 META_END is a literal. Otherwise we have a problem. */
7377 if (meta >= META_END)
7379 #ifdef DEBUG_SHOW_PARSED
7380 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7382 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
7386 /* Handle a literal character. We come here by goto in the case of a
7387 32-bit, non-UTF character whose value is greater than META_END. */
7390 meta = *pptr; /* Get the full 32 bits */
7391 NORMAL_CHAR_SET: /* Character is already in meta */
7392 matched_char = TRUE;
7394 /* For caseless UTF mode, check whether this character has more than one
7395 other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
7397 #ifdef SUPPORT_UNICODE
7398 if (utf && (options & PCRE2_CASELESS) != 0)
7400 uint32_t caseset = UCD_CASESET(meta);
7406 if (firstcuflags == REQ_UNSET)
7407 firstcuflags = zerofirstcuflags = REQ_NONE;
7408 break; /* End handling this meta item */
7413 /* Caseful matches, or not one of the multicase characters. Get the
7414 character's code units into mcbuffer, with the length in mclength. When not
7415 in UTF mode, the length is always 1. */
7417 #ifdef SUPPORT_UNICODE
7418 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7425 /* Generate the appropriate code */
7427 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7428 memcpy(code, mcbuffer, CU2BYTES(mclength));
7431 /* Remember if \r or \n were seen */
7433 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7434 cb->external_flags |= PCRE2_HASCRORLF;
7436 /* Set the first and required code units appropriately. If no previous
7437 first code unit, set it from this character, but revert to none on a zero
7438 repeat. Otherwise, leave the firstcu value alone, and don't change it on
7441 if (firstcuflags == REQ_UNSET)
7443 zerofirstcuflags = REQ_NONE;
7445 zeroreqcuflags = reqcuflags;
7447 /* If the character is more than one code unit long, we can set firstcu
7448 only if it is not to be matched caselessly. */
7450 if (mclength == 1 || req_caseopt == 0)
7452 firstcu = mcbuffer[0];
7453 firstcuflags = req_caseopt;
7457 reqcuflags = cb->req_varyopt;
7460 else firstcuflags = reqcuflags = REQ_NONE;
7463 /* firstcu was previously set; we can set reqcu only if the length is
7464 1 or the matching is caseful. */
7468 zerofirstcu = firstcu;
7469 zerofirstcuflags = firstcuflags;
7471 zeroreqcuflags = reqcuflags;
7472 if (mclength == 1 || req_caseopt == 0)
7475 reqcuflags = req_caseopt | cb->req_varyopt;
7478 break; /* End default meta handling */
7479 } /* End of big switch */
7480 } /* End of big loop */
7482 /* Control never reaches here. */
7487 /*************************************************
7488 * Compile regex: a sequence of alternatives *
7489 *************************************************/
7491 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7492 the closing bracket or META_END. The code variable is pointing at the code unit
7493 into which the BRA operator has been stored. This function is used during the
7494 pre-compile phase when we are trying to find out the amount of memory needed,
7495 as well as during the real compile phase. The value of lengthptr distinguishes
the two phases: it is NULL for the real compile, and points to a length
accumulator during the pre-compile.

Arguments:
7499 options option bits, including any changes for this subpattern
7500 codeptr -> the address of the current code pointer
7501 pptrptr -> the address of the current parsed pattern pointer
7502 errorcodeptr -> pointer to error code variable
7503 skipunits skip this many code units at start (for brackets and OP_COND)
7504 firstcuptr place to put the first required code unit
7505 firstcuflagsptr place to put the first code unit flags, or a negative number
7506 reqcuptr place to put the last required code unit
7507 reqcuflagsptr place to put the last required code unit flags, or a negative number
7508 bcptr pointer to the chain of currently open branches
7509 cb points to the data block with tables pointers etc.
7510 lengthptr NULL during the real compile phase
7511 points to length accumulator during pre-compile phase
7513 Returns: 0 There has been an error
7514 +1 Success, this group must match at least one character
7515 -1 Success, this group may match an empty string
7519 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
7520 int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
7521 int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
7522 branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
7524 PCRE2_UCHAR *code = *codeptr;
7525 PCRE2_UCHAR *last_branch = code;
7526 PCRE2_UCHAR *start_bracket = code;
7528 open_capitem capitem;
7531 uint32_t *pptr = *pptrptr;
7532 uint32_t firstcu, reqcu;
7533 uint32_t lookbehindlength;
7534 int32_t firstcuflags, reqcuflags;
7535 uint32_t branchfirstcu, branchreqcu;
7536 int32_t branchfirstcuflags, branchreqcuflags;
7540 /* If set, call the external function that checks for stack availability. A
non-zero result from that function is reported as ERR33. */
7542 if (cb->cx->stack_guard != NULL &&
7543 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7545 *errorcodeptr= ERR33;
7549 /* Miscellaneous initialization */
7552 bc.current_branch = code;
7554 firstcu = reqcu = 0;
7555 firstcuflags = reqcuflags = REQ_UNSET;
7557 /* Accumulate the length for use in the pre-compile phase. Start with the
7558 length of the BRA and KET and any extra code units that are required at the
7559 beginning. We accumulate in a local variable to save frequent testing of
7560 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
7561 start and end of each alternative, because compiled items are discarded during
7562 the pre-compile phase so that the workspace is not exceeded. */
7564 length = 2 + 2*LINK_SIZE + skipunits;
7566 /* Remember if this is a lookbehind assertion, and if it is, save its length
7567 and skip over the pattern offset. */
7569 lookbehind = *code == OP_ASSERTBACK || *code == OP_ASSERTBACK_NOT;
7572 lookbehindlength = META_DATA(pptr[-1]);
7575 else lookbehindlength = 0;
7577 /* If this is a capturing subpattern, add to the chain of open capturing items
7578 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
7579 need be tested here; changing this opcode to one of its variants, e.g.
7580 OP_SCBRAPOS, happens later, after the group has been compiled. */
7582 if (*code == OP_CBRA)
/* The chain entry is the local variable capitem, linked in at the head of
cb->open_caps. It is unlinked again further down, once the group has been
closed. */
7584 capnumber = GET2(code, 1 + LINK_SIZE);
7585 capitem.number = capnumber;
7586 capitem.next = cb->open_caps;
7587 capitem.flag = FALSE;
7588 capitem.assert_depth = cb->assert_depth;
7589 cb->open_caps = &capitem;
7592 /* Offset is set zero to mark that this bracket is still open */
7595 code += 1 + LINK_SIZE + skipunits;
7597 /* Loop for each alternative branch */
7603 /* Insert OP_REVERSE if this is a lookbehind assertion with a non-zero length. */
7605 if (lookbehind && lookbehindlength > 0)
7607 *code++ = OP_REVERSE;
7608 PUTINC(code, 0, lookbehindlength);
7609 length += 1 + LINK_SIZE;
7612 /* Now compile the branch; in the pre-compile phase its length gets added
7615 if ((branch_return =
7616 compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
7617 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
7618 cb, (lengthptr == NULL)? NULL : &length)) == 0)
7621 /* If a branch can match an empty string, so can the whole group. */
7623 if (branch_return < 0) okreturn = -1;
7625 /* In the real compile phase, there is some post-processing to be done. */
7627 if (lengthptr == NULL)
7629 /* If this is the first branch, the firstcu and reqcu values for the
7630 branch become the values for the regex. The test works because last_branch
starts out addressing the opening bracket and is only moved on to an ALT
node when a further branch is started. */
7632 if (*last_branch != OP_ALT)
7634 firstcu = branchfirstcu;
7635 firstcuflags = branchfirstcuflags;
7636 reqcu = branchreqcu;
7637 reqcuflags = branchreqcuflags;
7640 /* If this is not the first branch, the first char and reqcu have to
7641 match the values from all the previous branches, except that if the
7642 previous value for reqcu didn't have REQ_VARY set, it can still match,
7643 and we set REQ_VARY for the regex. */
7647 /* If we previously had a firstcu, but it doesn't match the new branch,
7648 we have to abandon the firstcu for the regex, but if there was
7649 previously no reqcu, it takes on the value of the old firstcu. */
7651 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
7653 if (firstcuflags >= 0)
7658 reqcuflags = firstcuflags;
7661 firstcuflags = REQ_NONE;
7664 /* If we (now or from before) have no firstcu, a firstcu from the
7665 branch becomes a reqcu if there isn't a branch reqcu. */
7667 if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
7668 branchreqcuflags < 0)
7670 branchreqcu = branchfirstcu;
7671 branchreqcuflags = branchfirstcuflags;
7674 /* Now ensure that the reqcus match */
7676 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
7677 reqcu != branchreqcu)
7678 reqcuflags = REQ_NONE;
7681 reqcu = branchreqcu;
7682 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
7687 /* Handle reaching the end of the expression, either ')' or end of pattern.
7688 In the real compile phase, go back through the alternative branches and
7689 reverse the chain of offsets, with the field in the BRA item now becoming an
7690 offset to the first alternative. If there are no alternatives, it points to
7691 the end of the group. The length in the terminating ket is always the length
7692 of the whole bracketed item. Return leaving the pointer at the terminating
7695 if (META_CODE(*pptr) != META_ALT)
7697 if (lengthptr == NULL)
7699 PCRE2_SIZE branch_length = code - last_branch;
7702 PCRE2_SIZE prev_length = GET(last_branch, 1);
7703 PUT(last_branch, 1, branch_length);
7704 branch_length = prev_length;
7705 last_branch -= branch_length;
7707 while (branch_length > 0);
7710 /* Fill in the ket */
7713 PUT(code, 1, (int)(code - start_bracket));
7714 code += 1 + LINK_SIZE;
7716 /* If it was a capturing subpattern, check to see if it contained any
7717 recursive back references. If so, we must wrap it in atomic brackets. In
7718 any event, remove the block from the chain. */
7722 if (cb->open_caps->flag)
/* Shift the whole compiled group up by 1 + LINK_SIZE code units so that an
OP_ONCE item can be placed in front of it. */
7724 (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7725 CU2BYTES(code - start_bracket));
7726 *start_bracket = OP_ONCE;
7727 code += 1 + LINK_SIZE;
7728 PUT(start_bracket, 1, (int)(code - start_bracket));
7730 PUT(code, 1, (int)(code - start_bracket));
7731 code += 1 + LINK_SIZE;
/* The wrapper adds two items of 1 + LINK_SIZE code units each. */
7732 length += 2 + 2*LINK_SIZE;
7734 cb->open_caps = cb->open_caps->next;
7737 /* Set values to pass back */
7741 *firstcuptr = firstcu;
7742 *firstcuflagsptr = firstcuflags;
7744 *reqcuflagsptr = reqcuflags;
7745 if (lengthptr != NULL)
/* Do not let the caller's running total go past OFLOW_MAX; that case is
reported as ERR20. */
7747 if (OFLOW_MAX - *lengthptr < length)
7749 *errorcodeptr = ERR20;
7752 *lengthptr += length;
7757 /* Another branch follows. In the pre-compile phase, we can move the code
7758 pointer back to where it was for the start of the first branch. (That is,
7759 pretend that each branch is the only one.)
7761 In the real compile phase, insert an ALT node. Its length field points back
7762 to the previous branch while the bracket remains open. At the end the chain
7763 is reversed. It's done like this so that the start of the bracket has a
7764 zero offset until it is closed, making it possible to detect recursion. */
7766 if (lengthptr != NULL)
7768 code = *codeptr + 1 + LINK_SIZE + skipunits;
7769 length += 1 + LINK_SIZE;
7774 PUT(code, 1, (int)(code - last_branch));
7775 bc.current_branch = last_branch = code;
7776 code += 1 + LINK_SIZE;
7779 /* Set the lookbehind length (if not in a lookbehind the value will be zero)
7780 and then advance past the vertical bar. */
7782 lookbehindlength = META_DATA(*pptr);
7785 /* Control never reaches here */
7790 /*************************************************
7791 * Check for anchored pattern *
7792 *************************************************/
7794 /* Try to find out if this is an anchored regular expression. Consider each
7795 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7796 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7797 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7798 be found, because ^ generates OP_CIRCM in that mode.
7800 We can also consider a regex to be anchored if OP_SOM starts all its branches.
7801 This is the code for \G, which means "match at start of match position, taking
7802 into account the match offset".
7804 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7805 because that will try the rest of the pattern at all possible matching points,
7806 so there is no point trying again.... er ....
7808 .... except when the .* appears inside capturing parentheses, and there is a
7809 subsequent back reference to those parentheses. We haven't enough information
7810 to catch that case precisely.
7812 At first, the best we could do was to detect when .* was in capturing brackets
7813 and the highest back reference was greater than or equal to that level.
7814 However, by keeping a bitmap of the first 31 back references, we can catch some
7815 of the more common cases more precisely.
7817 ... A second exception is when the .* appears inside an atomic group, because
7818 this prevents the number of characters it matches from being adjusted.
7821 code points to start of the compiled pattern
7822 bracket_map a bitmap of which brackets we are inside while testing; this
7823 handles up to substring 31; after that we just have to take
7824 the less precise approach
7825 cb points to the compile data block
7826 atomcount atomic group level
7827 inassert TRUE if in an assertion
7829 Returns: TRUE or FALSE
7833 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
7834 int atomcount, BOOL inassert)
/* Summary of the visible logic: for each alternative of the group at "code",
the opcode returned by first_significant_code() is examined. Nested groups are
handled by recursion, a dot-star item is subject to the conditions tested
further down, and anything else must be OP_SOD, OP_SOM or OP_CIRC; otherwise
FALSE is returned.
NOTE(review): this text has no "do" to pair with the closing "while", no
declaration of "op", and several "if" statements without a consequent, so some
short lines appear to be missing - confirm against a complete copy. */
7837 PCRE2_SPTR scode = first_significant_code(
7838 code + PRIV(OP_lengths)[*code], FALSE);
7841 /* Non-capturing brackets */
7843 if (op == OP_BRA || op == OP_BRAPOS ||
7844 op == OP_SBRA || op == OP_SBRAPOS)
7846 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
7850 /* Capturing brackets */
7852 else if (op == OP_CBRA || op == OP_CBRAPOS ||
7853 op == OP_SCBRA || op == OP_SCBRAPOS)
7855 int n = GET2(scode, 1+LINK_SIZE);
/* Record this group in the bitmap: bit n for group numbers below 32, bit 0
for every higher number. */
7856 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7857 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
7860 /* Positive forward assertion */
7862 else if (op == OP_ASSERT)
/* Everything inside the assertion is checked with inassert passed as TRUE. */
7864 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
7867 /* Condition. If there is no second branch, it can't be anchored. */
7869 else if (op == OP_COND || op == OP_SCOND)
7871 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
7872 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
/* Atomic group (OP_ONCE): recurse with the atomic nesting count raised by one. */
7878 else if (op == OP_ONCE)
7880 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
7884 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7885 it isn't in brackets that are or may be referenced or inside an atomic
7886 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
7887 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
7888 with the subject "aab", which matches "b", i.e. not at the start of a line.
7889 There is also an option that disables auto-anchoring. */
7891 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7892 op == OP_TYPEPOSSTAR))
7894 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
7895 atomcount > 0 || cb->had_pruneorskip || inassert ||
7896 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
7900 /* Check for explicit anchoring */
7902 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
/* Advance by the value GET() reads at code+1; the loop repeats while that
lands on another OP_ALT. */
7904 code += GET(code, 1);
7906 while (*code == OP_ALT); /* Loop for each alternative */
7912 /*************************************************
7913 * Check for starting with ^ or .* *
7914 *************************************************/
7916 /* This is called to find out if every branch starts with ^ or .* so that
7917 "first char" processing can be done to speed things up in multiline
7918 matching and for non-DOTALL patterns that start with .* (which must start at
7919 the beginning or after \n). As in the case of is_anchored() (see above), we
7920 have to take account of back references to capturing brackets that contain .*
7921 because in that case we can't make the assumption. Also, the appearance of .*
7922 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
7923 or *SKIP does not count, because once again the assumption no longer holds.
7926 code points to start of the compiled pattern or a group
7927 bracket_map a bitmap of which brackets we are inside while testing; this
7928 handles up to substring 31; after that we just have to take
7929 the less precise approach
7930 cb points to the compile data
7931 atomcount atomic group level
7932 inassert TRUE if in an assertion
7934 Returns: TRUE or FALSE
7938 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
7939 int atomcount, BOOL inassert)
/* Summary of the visible logic: for each alternative of the group at "code",
the opcode returned by first_significant_code() is examined. A conditional
group gets special treatment first; nested groups are then handled by
recursion, a dot-star item is subject to the conditions tested further down,
and anything else must be OP_CIRC or OP_CIRCM; otherwise FALSE is returned.
NOTE(review): this text has no "do" to pair with the closing "while", no
declaration of "op", no "switch" or case labels before "default:", and several
"if" statements without a consequent, so some short lines appear to be missing
- confirm against a complete copy. */
7942 PCRE2_SPTR scode = first_significant_code(
7943 code + PRIV(OP_lengths)[*code], FALSE);
7946 /* If we are at the start of a conditional assertion group, *both* the
7947 conditional assertion *and* what follows the condition must satisfy the test
7948 for start of line. Other kinds of condition fail. Note that there may be an
7949 auto-callout at the start of a condition. */
/* Step over the opcode and link of the conditional group, then over a
numbered or string callout if one comes first. */
7953 scode += 1 + LINK_SIZE;
7955 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
7956 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
7969 default: /* Assertion */
7970 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
/* Skip every alternative of the assertion and then the 1 + LINK_SIZE units
that follow the last one, so that scode addresses what comes after the
condition. */
7971 do scode += GET(scode, 1); while (*scode == OP_ALT);
7972 scode += 1 + LINK_SIZE;
7975 scode = first_significant_code(scode, FALSE);
7979 /* Non-capturing brackets */
7981 if (op == OP_BRA || op == OP_BRAPOS ||
7982 op == OP_SBRA || op == OP_SBRAPOS)
7984 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
7988 /* Capturing brackets */
7990 else if (op == OP_CBRA || op == OP_CBRAPOS ||
7991 op == OP_SCBRA || op == OP_SCBRAPOS)
7993 int n = GET2(scode, 1+LINK_SIZE);
/* Record this group in the bitmap: bit n for group numbers below 32, bit 0
for every higher number. */
7994 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7995 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
7998 /* Positive forward assertions */
8000 else if (op == OP_ASSERT)
8002 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8006 /* Atomic brackets */
8008 else if (op == OP_ONCE)
8010 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8014 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8015 brackets that may be referenced or an assertion, and as long as the pattern
8016 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8017 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8018 i.e. not at the start of a line. There is also an option that disables this
8021 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8023 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8024 atomcount > 0 || cb->had_pruneorskip || inassert ||
8025 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8029 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8030 that atomic brackets (OP_ONCE) do not reach this test: they are handled above
8031 by recursing with atomcount raised, which is what rules out .* inside them. */
8033 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8035 /* Move on to the next alternative */
8037 code += GET(code, 1);
8039 while (*code == OP_ALT); /* Loop for each alternative */
8045 /*************************************************
8046 * Scan compiled regex for recursion reference *
8047 *************************************************/
8049 /* This function scans through a compiled pattern until it finds an instance of
OP_RECURSE.
8053 code points to start of expression
8054 utf TRUE in UTF mode
8056 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8060 find_recurse(PCRE2_SPTR code, BOOL utf)
/* Summary of the visible logic: the opcode at "code" is read into c. OP_END
gives NULL and OP_RECURSE gives the current address. Any other item is stepped
over: items whose length is stored in the compiled code use that length, the
rest use the PRIV(OP_lengths) table, with extra units added for property types
and, when MAYBE_UTF_MULTI is defined, for multi-unit characters.
NOTE(review): the enclosing loop, the "switch" statements and many case labels
are not present in this text, so some short lines appear to be missing -
confirm against a complete copy. */
8064 PCRE2_UCHAR c = *code;
8065 if (c == OP_END) return NULL;
8066 if (c == OP_RECURSE) return code;
8068 /* XCLASS is used for classes that cannot be represented just by a bit map.
8069 This includes negated single high-valued characters. CALLOUT_STR is used for
8070 callouts with string arguments. In both cases the length in the table is
8071 zero; the actual length is stored in the compiled code. */
8073 if (c == OP_XCLASS) code += GET(code, 1);
8074 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8076 /* Otherwise, we can get the item's length from the table, except that for
8077 repeated character types, we have to test for \p and \P, which have an extra
8078 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8079 we must add in its length. */
8086 case OP_TYPEMINSTAR:
8088 case OP_TYPEMINPLUS:
8090 case OP_TYPEMINQUERY:
8091 case OP_TYPEPOSSTAR:
8092 case OP_TYPEPOSPLUS:
8093 case OP_TYPEPOSQUERY:
8094 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8097 case OP_TYPEPOSUPTO:
8099 case OP_TYPEMINUPTO:
/* For these opcodes the type code is looked for IMM2_SIZE units further on
than in the cases above. */
8101 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8114 /* Add in the fixed length from the table */
8116 code += PRIV(OP_lengths)[c];
8118 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8119 be followed by a multi-unit character. The length in the table is a
8120 minimum, so we have to arrange to skip the extra units. */
8122 #ifdef MAYBE_UTF_MULTI
8140 case OP_NOTMINUPTOI:
8144 case OP_NOTPOSUPTOI:
8152 case OP_NOTMINSTARI:
8156 case OP_NOTPOSSTARI:
8164 case OP_NOTMINPLUSI:
8168 case OP_NOTPOSPLUSI:
8175 case OP_NOTMINQUERY:
8176 case OP_NOTMINQUERYI:
8179 case OP_NOTPOSQUERY:
8180 case OP_NOTPOSQUERYI:
/* code has already been advanced by the table length, so code[-1] is the last
unit of that fixed-length part; it is tested to see whether more units follow. */
8181 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8185 (void)(utf); /* Keep compiler happy by referencing function argument */
8186 #endif /* MAYBE_UTF_MULTI */
8193 /*************************************************
8194 * Check for asserted fixed first code unit *
8195 *************************************************/
8197 /* During compilation, the "first code unit" settings from forward assertions
8198 are discarded, because they can cause conflicts with actual literals that
8199 follow. However, if we end up without a first code unit setting for an
8200 unanchored pattern, it is worth scanning the regex to see if there is an
8201 initial asserted first code unit. If all branches start with the same asserted
8202 code unit, or with a non-conditional bracket all of whose alternatives start
8203 with the same asserted code unit (recurse ad lib), then we return that code
8204 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8205 REQ_NONE in the flags.
8208 code points to start of compiled pattern
8209 flags points to the first code unit flags
8210 inassert non-zero if in an assertion
8212 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8216 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
/* Summary of the visible logic: for each alternative of the group at "code",
the first significant opcode is examined. Nested groups are handled by
recursion; other arms record scode[1] as the candidate code unit, but only when
inassert is non-zero. Every alternative must agree with the unit recorded by
the first one, otherwise 0 is returned.
NOTE(review): this text has no "do" to pair with the closing "while", no
"switch" or case labels, no declarations of c, d or dflags, and no final
store through "flags", so some short lines appear to be missing - confirm
against a complete copy. */
8219 int cflags = REQ_NONE;
/* An extra IMM2_SIZE units are skipped after the link for the capturing
bracket opcodes. */
8225 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8226 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8227 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8228 PCRE2_UCHAR op = *scode;
/* Nested group: recurse, raising the assertion depth by one when the group
is a positive assertion (OP_ASSERT). */
8243 d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
/* A negative cflags is used to mean that nothing has been recorded yet
(presumably REQ_NONE is negative - TODO confirm). Otherwise both the unit and
its flags must match what was recorded. */
8246 if (cflags < 0) { c = d; cflags = dflags; }
8247 else if (c != d || cflags != dflags) return 0;
/* This arm records scode[1] with flags of zero. NOTE(review): on a later
alternative only the code unit is compared here, not the flags. */
8258 if (inassert == 0) return 0;
8259 if (cflags < 0) { c = scode[1]; cflags = 0; }
8260 else if (c != scode[1]) return 0;
/* This arm records scode[1] with REQ_CASELESS. NOTE(review): as above, only
the code unit is compared on a later alternative. */
8271 if (inassert == 0) return 0;
8272 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8273 else if (c != scode[1]) return 0;
/* Advance by the value GET() reads at code+1; the loop repeats while that
lands on another OP_ALT. */
8277 code += GET(code, 1);
8279 while (*code == OP_ALT);
8287 /*************************************************
8288 * Add an entry to the name/number table *
8289 *************************************************/
8291 /* This function is called between compiling passes to add an entry to the
8292 name/number table, maintaining alphabetical order. Checking for permitted
8293 and forbidden duplicates has already been done.
8296 cb the compile data block
8297 name the name to add
8298 length the length of the name
8299 groupno the group number
8300 tablecount the count of names in the table so far
8306 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8307 unsigned int groupno, uint32_t tablecount)
/* Summary of the visible logic: the first tablecount entries of
cb->name_table, each cb->name_entry_size units long, are walked with "slot".
Each entry holds a group number in its first IMM2_SIZE units followed by a
name. Where room has to be made, the current and all later entries are moved up
by one slot. The group number and name are then written at "slot" and the rest
of the slot is zeroed.
NOTE(review): the declaration of "i", the test that selects the memmove and
the "break" mentioned in the comment below are not present in this text, so
some short lines appear to be missing - confirm against a complete copy. */
8310 PCRE2_UCHAR *slot = cb->name_table;
8312 for (i = 0; i < tablecount; i++)
/* Compare the new name with the first "length" units of this entry's name. */
8314 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8315 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8316 crc = -1; /* New name is a prefix of this entry's longer name */
8318 /* Make space in the table and break the loop for an earlier name. For a
8319 duplicate or later name, carry on. We do this for duplicates so that in the
8320 simple case (when ?(| is not used) they are in order of their numbers. In all
8321 cases they are in the order in which they appear in the pattern. */
/* Move entries i to tablecount-1 up by one slot; memmove is used because the
source and destination overlap. */
8325 (void)memmove(slot + cb->name_entry_size, slot,
8326 CU2BYTES((tablecount - i) * cb->name_entry_size));
8330 /* Continue the loop for a later or duplicate name */
8332 slot += cb->name_entry_size;
/* Write the new entry at slot: group number first, then the name. */
8335 PUT2(slot, 0, groupno);
8336 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8338 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8339 the memory is all initialized. Otherwise valgrind moans about uninitialized
8340 memory when saving serialized compiled patterns. */
8342 memset(slot + IMM2_SIZE + length, 0,
8343 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8348 /*************************************************
8349 * Skip in parsed pattern *
8350 *************************************************/
8352 /* This function is called to skip parts of the parsed pattern when finding the
8353 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8354 the end of the branch, it is called to skip over an internal lookaround, and it
8355 is also called to skip to the end of a class, during which it will never
8356 encounter nested groups (but there's no need to have special code for that).
8358 When called to find the end of a branch or group, pptr must point to the first
8359 meta code inside the branch, not the branch-starting code. In other cases it
8360 can point to the item that causes the function to be called.
8363 pptr current pointer to skip from
8364 skiptype PSKIP_CLASS when skipping to end of class
8365 PSKIP_ALT when META_ALT ends the skip
8366 PSKIP_KET when only META_KET ends the skip
8368 Returns: new value of pptr
8369 NULL if META_END is reached - should never occur
8370 or for an unknown meta value - likewise
8374 parsed_skip(uint32_t *pptr, uint32_t skiptype)
/* Summary of the visible logic: the meta code of each parsed element is
examined in turn. Literals (values below META_END) are passed over. Items with
variable-length data have pptr advanced by hand. Group-opening items are
listed together ahead of the nestlevel tests. The skip ends by returning pptr
at a class end when skiptype is PSKIP_CLASS, or, at nesting level zero, either
when skiptype is PSKIP_ALT or unconditionally at the last visible test. All
other items are stepped over using the meta_extra_lengths table.
NOTE(review): the enclosing loop, the "switch", several case labels and the
statements that change nestlevel are not present in this text, so some short
lines appear to be missing - confirm against a complete copy. */
8376 uint32_t nestlevel = 0;
8380 uint32_t meta = META_CODE(*pptr);
8384 default: /* Just skip over most items */
8385 if (meta < META_END) continue; /* Literal */
8388 /* This should never occur. */
8393 /* The data for these items is variable in length. */
8395 case META_BACKREF: /* Offset is present only if group >= 10 */
8396 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8399 case META_ESCAPE: /* A few escapes are followed by data items. */
8400 switch (META_DATA(*pptr))
8409 pptr += 1 + SIZEOFFSET;
8414 case META_MARK: /* Add the length of the name. */
8415 case META_COMMIT_ARG:
8416 case META_PRUNE_ARG:
8422 /* These are the "active" items in this loop. */
8424 case META_CLASS_END:
8425 if (skiptype == PSKIP_CLASS) return pptr;
8430 case META_COND_ASSERT:
8431 case META_COND_DEFINE:
8432 case META_COND_NAME:
8433 case META_COND_NUMBER:
8434 case META_COND_RNAME:
8435 case META_COND_RNUMBER:
8436 case META_COND_VERSION:
8437 case META_LOOKAHEAD:
8438 case META_LOOKAHEADNOT:
8439 case META_LOOKBEHIND:
8440 case META_LOOKBEHINDNOT:
8441 case META_NOCAPTURE:
/* With PSKIP_ALT the skip ends here only at the outermost nesting level
(presumably under a META_ALT case label - TODO confirm). */
8446 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
/* At the outermost nesting level this ends the skip whatever the skip type
(presumably under a META_KET case label - TODO confirm). */
8450 if (nestlevel == 0) return pptr;
8455 /* The extra data item length for each meta is in a table. */
/* Reduce the meta code to a table index; an index beyond the table gives
NULL. NOTE(review): sizeof() counts bytes, so this bound equals the number of
entries only if the table elements are one byte each - TODO confirm. */
8457 meta = (meta >> 16) & 0x7fff;
8458 if (meta >= sizeof(meta_extra_lengths)) return NULL;
8459 pptr += meta_extra_lengths[meta];
8461 /* Control never reaches here */
8467 /*************************************************
8468 * Find length of a parsed group *
8469 *************************************************/
8471 /* This is called for nested groups within a branch of a lookbehind whose
8472 length is being computed. If all the branches in the nested group have the same
8473 length, that is OK. On entry, the pointer must be at the first element after
8474 the group initializing code. On exit it points to META_KET. Caching is used to
8475 improve processing speed when the same capturing group occurs many times.
8478 pptrptr pointer to pointer in the parsed pattern
8479 isinline FALSE if a reference or recursion; TRUE for inline group
8480 errcodeptr pointer to the errorcode
8481 lcptr pointer to the loop counter
8482 group number of captured group or -1 for a non-capturing group
8483 recurses chain of recurse_check to catch mutual recursion
8484 cb pointer to the compile data
8486 Returns: the group length or a negative number
8490 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8491 int group, parsed_recurse_check *recurses, compile_block *cb)
/* Summary of the visible logic: for a numbered group, when PCRE2_DUPCAPUSED
is not set, cb->groupinfo[group] is consulted first. A cached "not fixed
length" flag gives -1 at once, and a cached length is returned after
(for an inline group only) moving *pptrptr to the end of the group with
parsed_skip(). Otherwise each branch is measured with get_branchlength(); all
branches must have the same non-negative length. The outcome is then stored in
cb->groupinfo[group].
NOTE(review): the loop around the branch scan, the declaration of
branchlength, the ISNOTFIXED label and the return statements at the end are
not present in this text, so some short lines appear to be missing - confirm
against a complete copy. */
8494 int grouplength = -1;
8496 /* The cache can be used only if there is no possibility of there being two
8497 groups with the same number. We do not need to set the end pointer for a group
8498 that is being processed as a back reference or recursion, but we must do so for
8501 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8503 uint32_t groupinfo = cb->groupinfo[group];
8504 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8505 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8507 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8508 return groupinfo & GI_FIXED_LENGTH_MASK;
8512 /* Scan the group. In this case we find the end pointer of necessity. */
/* The first branch sets the reference length; any later branch with a
different length, or any negative length, makes the group not fixed. */
8516 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8517 if (branchlength < 0) goto ISNOTFIXED;
8518 if (grouplength == -1) grouplength = branchlength;
8519 else if (grouplength != branchlength) goto ISNOTFIXED;
8520 if (**pptrptr == META_KET) break;
8521 *pptrptr += 1; /* Skip META_ALT */
/* Cache the fixed length together with the flag that says it has been set. */
8525 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
/* Cache the fact that this numbered group does not have a fixed length. */
8529 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
8535 /*************************************************
8536 * Find length of a parsed branch *
8537 *************************************************/
8539 /* Return a fixed length for a branch in a lookbehind, giving an error if the
8540 length is not fixed. If any lookbehinds are encountered on the way, they get
8541 their length set. On entry, *pptrptr points to the first element inside the
8542 branch. On exit it is set to point to the ALT or KET.
8545 pptrptr pointer to pointer in the parsed pattern
8546 errcodeptr pointer to error code
8547 lcptr pointer to loop counter
8548 recurses chain of recurse_check to catch mutual recursion
8549 cb pointer to compile block
8551 Returns: the length, or a negative value on error
8555 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
8556 parsed_recurse_check *recurses, compile_block *cb)
/* Summary of the visible logic: starting at *pptrptr, each parsed item yields
an item length that is added to branchlength and remembered in lastitemlength
for a following quantifier. The running total is checked against
LOOKBEHIND_MAX (ERR87), and the final total raises cb->max_lookbehind when it
is larger. Every call first increments *lcptr and gives up with ERR35 once the
counter has passed 2000. Items that cannot have a fixed length lead to ERR25,
and ERR90 is set at the end of the function, presumably at the
PARSED_SKIP_FAILED label.
NOTE(review): the enclosing loop, many case labels, the labels targeted by the
"goto" statements, several declarations (for example group, offset, escape,
grouplength, name, i) and most "break" and "return" statements are not present
in this text, so some short lines appear to be missing - confirm against a
complete copy. */
8558 int branchlength = 0;
8560 uint32_t lastitemlength = 0;
8561 uint32_t *pptr = *pptrptr;
8563 parsed_recurse_check this_recurse;
8565 /* A large and/or complex regex can take too long to process. This can happen
8566 more often when (?| groups are present in the pattern because their length
8567 cannot be cached. */
8569 if ((*lcptr)++ > 2000)
8571 *errcodeptr = ERR35; /* Lookbehind is too complicated */
8575 /* Scan the branch, accumulating the length. */
8579 parsed_recurse_check *r;
8580 uint32_t *gptr, *gptrend;
8583 uint32_t itemlength = 0;
/* Values below META_END are literals; everything else is dispatched on its
meta code. */
8585 if (*pptr < META_END)
8590 else switch (META_CODE(*pptr))
8596 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
8597 actual termination. */
8601 pptr = parsed_skip(pptr, PSKIP_ALT);
8602 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8606 case META_COMMIT_ARG:
8607 case META_PRUNE_ARG:
/* pptr[1] is used as the number of elements that follow it; step over them. */
8610 pptr += pptr[1] + 1;
8613 case META_CIRCUMFLEX:
8631 case META_CLASS_NOT:
8633 pptr = parsed_skip(pptr, PSKIP_CLASS);
8634 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8637 case META_CLASS_EMPTY_NOT:
8642 case META_CALLOUT_NUMBER:
8646 case META_CALLOUT_STRING:
8647 pptr += 3 + SIZEOFFSET;
8650 /* Only some escapes consume a character. Of those, \R and \X are never
8651 allowed because they might match more than one character. \C is allowed only in
8652 32-bit and non-UTF 8/16-bit modes. */
8655 escape = META_DATA(*pptr);
8656 if (escape == ESC_R || escape == ESC_X) return -1;
8657 if (escape > ESC_b && escape < ESC_Z)
8659 #if PCRE2_CODE_UNIT_WIDTH != 32
8660 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
8662 *errcodeptr = ERR36;
8667 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
8671 /* Lookaheads can be ignored, but we must start the skip inside the group
8672 so that it isn't treated as a group within the branch. */
8674 case META_LOOKAHEAD:
8675 case META_LOOKAHEADNOT:
8676 pptr = parsed_skip(pptr + 1, PSKIP_KET);
8677 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8679 /* Also ignore any qualifiers that follow a lookahead assertion. */
8684 case META_ASTERISK_PLUS:
8685 case META_ASTERISK_QUERY:
8687 case META_PLUS_PLUS:
8688 case META_PLUS_QUERY:
8690 case META_QUERY_PLUS:
8691 case META_QUERY_QUERY:
8696 case META_MINMAX_PLUS:
8697 case META_MINMAX_QUERY:
8706 /* Lookbehinds can be ignored, but must themselves be checked. */
8708 case META_LOOKBEHIND:
8709 case META_LOOKBEHINDNOT:
8710 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
8714 /* Back references and recursions are handled by very similar code. At this
8715 stage, the names generated in the parsing pass are available, but the main
8716 name table has not yet been created. So for the named varieties, scan the
8717 list of names in order to get the number of the first one in the pattern,
8718 and whether or not this name is duplicated. */
8720 case META_BACKREF_BYNAME:
8721 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
8725 case META_RECURSE_BYNAME:
8729 BOOL is_dupname = FALSE;
8730 named_group *ng = cb->named_groups;
8731 uint32_t meta_code = META_CODE(*pptr);
8732 uint32_t length = *(++pptr);
8734 GETPLUSOFFSET(offset, pptr);
8735 name = cb->start_pattern + offset;
/* Look for a group whose name has the same length and the same characters. */
8736 for (i = 0; i < cb->names_found; i++, ng++)
8738 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
8741 is_dupname = ng->isdup;
8748 *errcodeptr = ERR15; /* Non-existent subpattern */
8749 cb->erroroffset = offset;
8753 /* A numerical back reference can be fixed length if duplicate capturing
8754 groups are not being used. A non-duplicate named back reference can also
8757 if (meta_code == META_RECURSE_BYNAME ||
8758 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
8759 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
8761 goto ISNOTFIXED; /* Duplicate name or number */
8763 /* The offset values for back references < 10 are in a separate vector
8764 because otherwise they would use more than two parsed pattern elements on
8768 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
8769 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
8771 group = META_DATA(*pptr);
8774 offset = cb->small_ref_offset[group];
8775 goto RECURSE_OR_BACKREF_LENGTH;
8779 /* For groups >= 10 - picking up group twice does no harm. */
8781 /* A true recursion implies not fixed length, but a subroutine call may
8782 be OK. Back reference "recursions" are also failed. */
8785 group = META_DATA(*pptr);
8786 GETPLUSOFFSET(offset, pptr);
8788 RECURSE_OR_BACKREF_LENGTH:
8789 if (group > cb->bracount)
8791 cb->erroroffset = offset;
8792 *errcodeptr = ERR15; /* Non-existent subpattern */
8795 if (group == 0) goto ISNOTFIXED; /* Local recursion */
/* Search the parsed pattern for the element that opens capture group "group".
The element after a META_BIGVALUE is stepped over without being tested
(presumably it is a data value rather than a meta code - TODO confirm). */
8796 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
8798 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
8799 else if (*gptr == (META_CAPTURE | group)) break;
8802 /* We must start the search for the end of the group at the first meta code
8803 inside the group. Otherwise it will be treated as an enclosed group. */
8805 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
8806 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
8807 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
8808 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
8809 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
/* Link this group onto the front of the chain of groups being measured. */
8810 this_recurse.prev = recurses;
8811 this_recurse.groupptr = gptr;
8813 /* We do not need to know the position of the end of the group, that is,
8814 gptr is not used after the call to get_grouplength(). Setting the second
8815 argument FALSE stops it scanning for the end when the length can be found
8819 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
8821 if (grouplength < 0)
8823 if (*errcodeptr == 0) goto ISNOTFIXED;
8824 return -1; /* Error already set */
8826 itemlength = grouplength;
8829 /* Check nested groups - advance past the initial data for each type and
8830 then seek a fixed length with get_grouplength(). */
8832 case META_COND_NAME:
8833 case META_COND_NUMBER:
8834 case META_COND_RNAME:
8835 case META_COND_RNUMBER:
8836 case META_COND_DEFINE:
8837 pptr += 2 + SIZEOFFSET;
8840 case META_COND_ASSERT:
8844 case META_COND_VERSION:
8849 group = META_DATA(*pptr);
8853 case META_NOCAPTURE:
8856 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
8858 if (grouplength < 0) return -1;
8859 itemlength = grouplength;
8862 /* Exact repetition is OK; variable repetition is not. A repetition of zero
8863 must subtract the length that has already been added. */
8866 case META_MINMAX_PLUS:
8867 case META_MINMAX_QUERY:
/* pptr[1] and pptr[2] are the two repeat counts; they must be equal. One copy
of the previous item has already been counted, hence the (count - 1). */
8868 if (pptr[1] == pptr[2])
8870 if (pptr[1] == 0) branchlength -= lastitemlength;
8871 else itemlength = (pptr[1] - 1) * lastitemlength;
8877 /* Any other item means this branch does not have a fixed length. */
8881 *errcodeptr = ERR25; /* Not fixed length */
8885 /* Add the item length to the branchlength, and save it for use if the next
8886 thing is a quantifier. */
8888 branchlength += itemlength;
8889 lastitemlength = itemlength;
8891 /* Ensure that the length does not overflow the limit. */
8893 if (branchlength > LOOKBEHIND_MAX)
8895 *errcodeptr = ERR87;
/* Remember the longest lookbehind branch measured so far. */
8902 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
8903 return branchlength;
8906 *errcodeptr = ERR90;
8912 /*************************************************
8913 * Set lengths in a lookbehind *
8914 *************************************************/
8916 /* This function is called for each lookbehind, to set the lengths in its
8917 branches. An error occurs if any branch does not have a fixed length that is
8918 less than the maximum (65535). On exit, the pointer must be left on the final
8922 pptrptr pointer to pointer in the parsed pattern
8923 errcodeptr pointer to error code
8924 lcptr pointer to loop counter
8925 recurses chain of recurse_check to catch mutual recursion
8926 cb pointer to compile block
8928 Returns: TRUE if all is well
8929 FALSE otherwise, with error code and offset set
8933 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
8934 parsed_recurse_check *recurses, compile_block *cb)
/* Summary of the visible logic: bptr starts at the element *pptrptr addresses
on entry. The pattern offset that follows it is read for use in error
messages and *pptrptr is moved past it. Each branch is then measured with
get_branchlength(); a negative result sets ERR25 and the saved offset unless
an error code or offset is already recorded. A valid length is ORed into the
element at bptr. The loop repeats while bptr addresses a META_ALT.
NOTE(review): this text has no "do" to pair with the closing "while", no
declarations of offset or branchlength, no statement that moves bptr on to
the next branch and no return statements, so some short lines appear to be
missing - confirm against a complete copy. */
8938 uint32_t *bptr = *pptrptr;
8940 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
8941 *pptrptr += SIZEOFFSET;
8946 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8947 if (branchlength < 0)
8949 /* The errorcode and offset may already be set from a nested lookbehind. */
8950 if (*errcodeptr == 0) *errcodeptr = ERR25;
8951 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
/* Store the length in the low-order bits of the element at bptr. */
8954 *bptr |= branchlength; /* branchlength never more than 65535 */
8957 while (*bptr == META_ALT);
8964 /*************************************************
8965 * Check parsed pattern lookbehinds *
8966 *************************************************/
8968 /* This function is called at the end of parsing a pattern if any lookbehinds
8969 were encountered. It scans the parsed pattern for them, calling
8970 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
8971 the error offset is marked unset. This enables the functions above not to
8972 override settings from deeper nestings.
8974 Arguments cb points to the compile block
8975 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
8979 check_lookbehinds(compile_block *cb)
/* Summary of the visible logic: cb->erroroffset is reset to PCRE2_UNSET and
cb->parsed_pattern is walked up to META_END. Literals are passed over. For each
meta code, pptr is advanced past whatever data elements follow it; lookbehinds
are handed to set_lookbehind_lengths() with a NULL recursion chain. A meta code
that is not recognized returns ERR70.
NOTE(review): the declarations of pptr, errorcode and loopcount, the
"default:" label, many case labels, the "break" statements and the final
return are not present in this text, so some short lines appear to be missing
- confirm against a complete copy. */
8985 cb->erroroffset = PCRE2_UNSET;
8987 for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++)
8989 if (*pptr < META_END) continue; /* Literal */
8991 switch (META_CODE(*pptr))
8994 return ERR70; /* Unrecognized meta code */
8997 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9004 case META_ASTERISK_PLUS:
9005 case META_ASTERISK_QUERY:
9009 case META_CIRCUMFLEX:
9011 case META_CLASS_EMPTY:
9012 case META_CLASS_EMPTY_NOT:
9013 case META_CLASS_END:
9014 case META_CLASS_NOT:
9016 case META_COND_ASSERT:
9021 case META_LOOKAHEAD:
9022 case META_LOOKAHEADNOT:
9023 case META_NOCAPTURE:
9025 case META_PLUS_PLUS:
9026 case META_PLUS_QUERY:
9029 case META_QUERY_PLUS:
9030 case META_QUERY_QUERY:
9031 case META_RANGE_ESCAPED:
9032 case META_RANGE_LITERAL:
9041 case META_BACKREF_BYNAME:
9042 case META_COND_DEFINE:
9043 case META_COND_NAME:
9044 case META_COND_NUMBER:
9045 case META_COND_RNAME:
9046 case META_COND_RNUMBER:
9047 case META_RECURSE_BYNAME:
/* These items are followed by 1 + SIZEOFFSET data elements. */
9048 pptr += 1 + SIZEOFFSET;
9051 case META_CALLOUT_STRING:
/* A string callout is followed by 3 + SIZEOFFSET data elements. */
9052 pptr += 3 + SIZEOFFSET;
9058 case META_POSIX_NEG:
9063 case META_MINMAX_QUERY:
9064 case META_MINMAX_PLUS:
9068 case META_CALLOUT_NUMBER:
9069 case META_COND_VERSION:
9074 case META_COMMIT_ARG:
9075 case META_PRUNE_ARG:
/* pptr[1] is used as the number of further elements to step over. */
9078 pptr += 1 + pptr[1];
9081 case META_LOOKBEHIND:
9082 case META_LOOKBEHINDNOT:
9083 if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, NULL, cb))
9094 /*************************************************
9095 * External function to compile a pattern *
9096 *************************************************/
9098 /* This function reads a regular expression in the form of a string and returns
9099 a pointer to a block of store holding a compiled version of the expression.
9102 pattern the regular expression
9103 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
9105 errorptr pointer to errorcode
9106 erroroffset pointer to error offset
9107 ccontext points to a compile context or is NULL
9109 Returns: pointer to compiled data block, or NULL on error,
9110 with errorcode and erroroffset set
9113 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
9114 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9115 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9117 BOOL utf; /* Set TRUE for UTF mode */
9118 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
9119 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
9120 pcre2_real_code *re = NULL; /* What we will return */
9121 compile_block cb; /* "Static" compile-time data */
9122 const uint8_t *tables; /* Char tables base pointer */
9124 PCRE2_UCHAR *code; /* Current pointer in compiled code */
9125 PCRE2_SPTR codestart; /* Start of compiled code */
9126 PCRE2_SPTR ptr; /* Current pointer in pattern */
9127 uint32_t *pptr; /* Current pointer in parsed pattern */
9129 PCRE2_SIZE length = 1; /* Allow for final END opcode */
9130 PCRE2_SIZE usedlength; /* Actual length used */
9131 PCRE2_SIZE re_blocksize; /* Size of memory block */
9132 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
9133 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
9135 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
9136 uint32_t firstcu, reqcu; /* Value of first/req code unit */
9137 uint32_t setflags = 0; /* NL and BSR set flags */
9139 uint32_t skipatstart; /* When checking (*UTF) etc */
9140 uint32_t limit_heap = UINT32_MAX;
9141 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
9142 uint32_t limit_depth = UINT32_MAX;
9144 int newline = 0; /* Unset; can be set by the pattern */
9145 int bsr = 0; /* Unset; can be set by the pattern */
9146 int errorcode = 0; /* Initialize to avoid compiler warn */
9147 int regexrc; /* Return from compile */
9149 uint32_t i; /* Local loop counter */
9151 /* Comments at the head of this file explain about these variables. */
9153 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9154 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9155 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9157 /* The workspace is used in different ways in the different compiling phases.
9158 It needs to be 16-bit aligned for the preliminary parsing scan. */
9160 uint32_t c16workspace[C16_WORK_SIZE];
9161 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9164 /* -------------- Check arguments and set up the pattern ----------------- */
9166 /* There must be error code and offset pointers. */
9168 if (errorptr == NULL || erroroffset == NULL) return NULL;
9172 /* There must be a pattern! */
9174 if (pattern == NULL)
9180 /* A NULL compile context means "use a default context" */
9182 if (ccontext == NULL)
9183 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9185 /* Check that all undefined public option bits are zero. */
9187 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9188 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9194 if ((options & PCRE2_LITERAL) != 0 &&
9195 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9196 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9202 /* A zero-terminated pattern is indicated by the special length value
9203 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9205 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9206 patlen = PRIV(strlen)(pattern);
9208 if (patlen > ccontext->max_pattern_length)
9214 /* From here on, all returns from this function should end up going via the common exit code. */
9218 /* ------------ Initialize the "static" compile data -------------- */
9220 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9222 cb.lcc = tables + lcc_offset; /* Individual */
9223 cb.fcc = tables + fcc_offset; /* character */
9224 cb.cbits = tables + cbits_offset; /* tables */
9225 cb.ctypes = tables + ctypes_offset;
9227 cb.assert_depth = 0;
9230 cb.dupnames = FALSE;
9231 cb.end_pattern = pattern + patlen;
9233 cb.external_flags = 0;
9234 cb.external_options = options;
9235 cb.groupinfo = stack_groupinfo;
9236 cb.had_recurse = FALSE;
9238 cb.max_lookbehind = 0;
9239 cb.name_entry_size = 0;
9240 cb.name_table = NULL;
9241 cb.named_groups = named_groups;
9242 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9244 cb.open_caps = NULL;
9245 cb.parens_depth = 0;
9246 cb.parsed_pattern = stack_parsed_pattern;
9248 cb.start_code = cworkspace;
9249 cb.start_pattern = pattern;
9250 cb.start_workspace = cworkspace;
9251 cb.workspace_size = COMPILE_WORK_SIZE;
9253 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9254 references to help in deciding whether (.*) can be treated as anchored or not. */
9260 /* Escape sequences \1 to \9 are always back references, but as they are only
9261 two characters long, only two elements can be used in the parsed_pattern
9262 vector. The first contains the reference, and we'd like to use the second to
9263 record the offset in the pattern, so that forward references to non-existent
9264 groups can be diagnosed later with an offset. However, on 64-bit systems,
9265 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9266 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9267 references have enough space for the offset to be put into the parsed pattern. */
9270 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9273 /* --------------- Start looking at the pattern --------------- */
9275 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9276 the start of the pattern, and remember the offset to the actual regex. With
9277 valgrind support, make the terminator of a zero-terminated pattern
9278 inaccessible. This catches bugs that would otherwise only show up for
9279 non-zero-terminated patterns. */
9281 #ifdef SUPPORT_VALGRIND
9282 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9288 if ((options & PCRE2_LITERAL) == 0)
9290 while (patlen - skipatstart >= 2 &&
9291 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9292 ptr[skipatstart+1] == CHAR_ASTERISK)
9294 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9297 pso *p = pso_list + i;
9299 if (patlen - skipatstart - 2 >= p->length &&
9300 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9303 skipatstart += p->length + 2;
9307 cb.external_options |= p->value;
9311 setflags |= p->value;
9316 setflags |= PCRE2_NL_SET;
9321 setflags |= PCRE2_BSR_SET;
9329 if (!IS_DIGIT(ptr[pp]))
9333 goto HAD_EARLY_ERROR;
9335 while (IS_DIGIT(ptr[pp]))
9337 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
9338 c = c*10 + (ptr[pp++] - CHAR_0);
9340 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9344 goto HAD_EARLY_ERROR;
9346 if (p->type == PSO_LIMH) limit_heap = c;
9347 else if (p->type == PSO_LIMM) limit_match = c;
9348 else limit_depth = c;
9349 skipatstart += pp - skipatstart;
9352 break; /* Out of the table scan loop */
9355 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
9359 /* End of pattern-start options; advance to start of real regex. */
9363 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
9365 #ifndef SUPPORT_UNICODE
9366 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9369 goto HAD_EARLY_ERROR;
9373 /* Check UTF. We have the original options in 'options', with that value as
9374 modified by (*UTF) etc in cb.external_options. The extra option
9375 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9376 surrogate code points cannot be represented in UTF-16. */
9378 utf = (cb.external_options & PCRE2_UTF) != 0;
9381 if ((options & PCRE2_NEVER_UTF) != 0)
9384 goto HAD_EARLY_ERROR;
9386 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9387 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9388 goto HAD_ERROR; /* Offset was set by valid_utf() */
9390 #if PCRE2_CODE_UNIT_WIDTH == 16
9391 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9394 goto HAD_EARLY_ERROR;
9399 /* Check UCP lockout. */
9401 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
9402 (PCRE2_UCP|PCRE2_NEVER_UCP))
9405 goto HAD_EARLY_ERROR;
9408 /* Process the BSR setting. */
9410 if (bsr == 0) bsr = ccontext->bsr_convention;
9412 /* Process the newline setting. */
9414 if (newline == 0) newline = ccontext->newline_convention;
9415 cb.nltype = NLTYPE_FIXED;
9418 case PCRE2_NEWLINE_CR:
9423 case PCRE2_NEWLINE_LF:
9428 case PCRE2_NEWLINE_NUL:
9430 cb.nl[0] = CHAR_NUL;
9433 case PCRE2_NEWLINE_CRLF:
9439 case PCRE2_NEWLINE_ANY:
9440 cb.nltype = NLTYPE_ANY;
9443 case PCRE2_NEWLINE_ANYCRLF:
9444 cb.nltype = NLTYPE_ANYCRLF;
9449 goto HAD_EARLY_ERROR;
9452 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
9453 their numerical equivalents, so that this information is always available for
9454 the remaining processing. (2) At the same time, parse the pattern and put a
9455 processed version into the parsed_pattern vector. This has escapes interpreted
9456 and comments removed (amongst other things).
9458 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
9459 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
9460 one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
9461 PCRE2_EXTRA_MATCH_LINE is set. The exceptional case is when running in 32-bit,
9462 non-UTF mode, when literal characters >= META_END (0x80000000) have to be coded
9463 as two units. In this case we therefore scan the pattern to check for such values. */
9465 #if PCRE2_CODE_UNIT_WIDTH == 32
9469 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
9473 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
9474 is set we have to assume a numerical callout (4 elements) for each character
9475 plus one at the end. This is overkill, but memory is plentiful these days. For
9476 many smaller patterns the vector on the stack (which was set up above) can be used. */
9479 parsed_size_needed = patlen - skipatstart + big32count;
9481 if ((ccontext->extra_options &
9482 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
9483 parsed_size_needed += 4;
9485 if ((options & PCRE2_AUTO_CALLOUT) != 0)
9486 parsed_size_needed = (parsed_size_needed + 1) * 5;
9488 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
9490 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
9491 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
9492 if (heap_parsed_pattern == NULL)
9497 cb.parsed_pattern = heap_parsed_pattern;
9499 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
9501 /* Do the parsing scan. */
9503 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
9504 if (errorcode != 0) goto HAD_CB_ERROR;
9506 /* Workspace is needed to remember information about numbered groups: whether a
9507 group can match an empty string and what its fixed length is. This is done to
9508 avoid the possibility of recursive references causing very long compile times
9509 when checking these features. Unnumbered groups do not have this exposure since
9510 they cannot be referenced. We use an indexed vector for this purpose. If there
9511 are sufficiently few groups, the default vector on the stack, as set up above,
9512 can be used. Otherwise we have to get/free a special vector. The vector must be
9513 initialized to zero. */
9515 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
9517 cb.groupinfo = ccontext->memctl.malloc(
9518 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
9519 if (cb.groupinfo == NULL)
9526 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
9528 /* If there were any lookbehinds, scan the parsed pattern to figure out their lengths. */
9533 errorcode = check_lookbehinds(&cb);
9534 if (errorcode != 0) goto HAD_CB_ERROR;
9537 /* For debugging, there is a function that shows the parsed data vector. */
9539 #ifdef DEBUG_SHOW_PARSED
9540 fprintf(stderr, "+++ Pre-scan complete:\n");
9544 /* For debugging capturing information this code can be enabled. */
9546 #ifdef DEBUG_SHOW_CAPTURES
9548 named_group *ng = cb.named_groups;
9549 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
9550 for (i = 0; i < cb.names_found; i++, ng++)
9552 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
9557 /* Pretend to compile the pattern while actually just accumulating the amount
9558 of memory required in the 'length' variable. This behaviour is triggered by
9559 passing a non-NULL final argument to compile_regex(). We pass a block of
9560 workspace (cworkspace) for it to compile parts of the pattern into; the
9561 compiled code is discarded when it is no longer needed, so hopefully this
9562 workspace will never overflow, though there is a test for its doing so.
9564 On error, errorcode will be set non-zero, so we don't need to look at the
9565 result of the function. The initial options have been put into the cb block,
9566 but we still have to pass a separate options variable (the first argument)
9567 because the options may change as the pattern is processed. */
9569 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
9570 pptr = cb.parsed_pattern;
9574 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
9575 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
9577 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
9579 /* This should be caught in compile_regex(), but just in case... */
9581 if (length > MAX_PATTERN_SIZE)
9587 /* Compute the size of, and then get and initialize, the data block for storing
9588 the compiled pattern and names table. Integer overflow should no longer be
9589 possible because nowadays we limit the maximum value of cb.names_found and
9590 cb.name_entry_size. */
9592 re_blocksize = sizeof(pcre2_real_code) +
9594 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
9595 re = (pcre2_real_code *)
9596 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
9603 /* The compiler may put padding at the end of the pcre2_real_code structure in
9604 order to round it up to a multiple of 4 or 8 bytes. This means that when a
9605 compiled pattern is copied (for example, when serialized) undefined bytes are
9606 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
9607 write to the last 8 bytes of the structure before setting the fields. */
9609 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
9610 re->memctl = ccontext->memctl;
9611 re->tables = tables;
9612 re->executable_jit = NULL;
9613 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
9614 re->blocksize = re_blocksize;
9615 re->magic_number = MAGIC_NUMBER;
9616 re->compile_options = options;
9617 re->overall_options = cb.external_options;
9618 re->extra_options = ccontext->extra_options;
9619 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
9620 re->limit_heap = limit_heap;
9621 re->limit_match = limit_match;
9622 re->limit_depth = limit_depth;
9623 re->first_codeunit = 0;
9624 re->last_codeunit = 0;
9625 re->bsr_convention = bsr;
9626 re->newline_convention = newline;
9627 re->max_lookbehind = 0;
9629 re->top_bracket = 0;
9630 re->top_backref = 0;
9631 re->name_entry_size = cb.name_entry_size;
9632 re->name_count = cb.names_found;
9634 /* The basic block is immediately followed by the name table, and the compiled
9635 code follows after that. */
9637 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
9638 re->name_entry_size * re->name_count;
9640 /* Update the compile data block for the actual compile. The starting points of
9641 the name/number translation table and of the code are passed around in the
9642 compile data block. The start/end pattern and initial options are already set
9643 from the pre-compile phase, as is the name_entry_size field. */
9645 cb.parens_depth = 0;
9646 cb.assert_depth = 0;
9648 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
9649 cb.start_code = codestart;
9651 cb.had_accept = FALSE;
9652 cb.had_pruneorskip = FALSE;
9653 cb.open_caps = NULL;
9655 /* If any named groups were found, create the name/number table from the list
9656 created in the pre-pass. */
9658 if (cb.names_found > 0)
9660 named_group *ng = cb.named_groups;
9661 for (i = 0; i < cb.names_found; i++, ng++)
9662 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
9665 /* Set up a starting, non-extracting bracket, then compile the expression. On
9666 error, errorcode will be set non-zero, so we don't need to look at the result
9667 of the function here. */
9669 pptr = cb.parsed_pattern;
9670 code = (PCRE2_UCHAR *)codestart;
9672 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
9673 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
9674 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
9675 re->top_bracket = cb.bracount;
9676 re->top_backref = cb.top_backref;
9677 re->max_lookbehind = cb.max_lookbehind;
9681 reqcu = 0; /* Must disable after (*ACCEPT) */
9682 reqcuflags = REQ_NONE;
9685 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
9686 but the estimated length exceeds the really used length, adjust the value of
9687 re->blocksize, and if valgrind support is configured, mark the extra allocated
9688 memory as unaddressable, so that any out-of-bound reads can be detected. */
9691 usedlength = code - codestart;
9692 if (usedlength > length) errorcode = ERR23; else
9694 re->blocksize -= CU2BYTES(length - usedlength);
9695 #ifdef SUPPORT_VALGRIND
9696 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
9700 /* Scan the pattern for recursion/subroutine calls and convert the group
9701 numbers into offsets. Maintain a small cache so that repeated groups containing
9702 recursions are efficiently handled. */
9704 #define RSCAN_CACHE_SIZE 8
9706 if (errorcode == 0 && cb.had_recurse)
9710 unsigned int ccount = 0;
9711 int start = RSCAN_CACHE_SIZE;
9712 recurse_cache rc[RSCAN_CACHE_SIZE];
9714 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
9716 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
9720 groupnumber = (int)GET(rcode, 1);
9721 if (groupnumber == 0) rgroup = codestart; else
9723 PCRE2_SPTR search_from = codestart;
9725 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
9727 if (groupnumber == rc[p].groupnumber)
9729 rgroup = rc[p].group;
9733 /* Group n+1 must always start to the right of group n, so we can save
9734 search time below when the new group number is greater than any of the
9735 previously found groups. */
9737 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
9742 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
9748 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
9749 rc[start].groupnumber = groupnumber;
9750 rc[start].group = rgroup;
9751 if (ccount < RSCAN_CACHE_SIZE) ccount++;
9755 PUT(rcode, 1, rgroup - codestart);
9759 /* In rare debugging situations we sometimes need to look at the compiled code at this stage. */
9762 #ifdef DEBUG_CALL_PRINTINT
9763 pcre2_printint(re, stderr, TRUE);
9764 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
9767 /* Unless disabled, check whether any single character iterators can be
9768 auto-possessified. The function overwrites the appropriate opcode values, so
9769 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9770 used in this code because at least one compiler gives a warning about loss of
9771 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the function call. */
9774 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
9776 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
9777 if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
9780 /* Failed to compile, or error while post-processing. */
9782 if (errorcode != 0) goto HAD_CB_ERROR;
9784 /* Successful compile. If the anchored option was not passed, set it if
9785 we can determine that the pattern is anchored by virtue of ^ characters or \A
9786 or anything else, such as starting with non-atomic .* when DOTALL is set and
9787 there are no occurrences of *PRUNE or *SKIP (though there is an option to
9788 disable this case). */
9790 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
9791 is_anchored(codestart, 0, &cb, 0, FALSE))
9792 re->overall_options |= PCRE2_ANCHORED;
9794 /* Set up the first code unit or startline flag, the required code unit, and
9795 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
9796 is set, as the data it would create will not be used. Note that a first code
9797 unit (but not the startline flag) is useful for anchored patterns because it
9798 can still give a quick "no match" and also avoid searching for a last code unit. */
9801 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
9803 /* If we do not have a first code unit, see if there is one that is asserted
9804 (these are not saved during the compile because they can cause conflicts with
9805 actual literals that follow). */
9807 if (firstcuflags < 0)
9808 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
9810 /* Save the data for a first code unit. */
9812 if (firstcuflags >= 0)
9814 re->first_codeunit = firstcu;
9815 re->flags |= PCRE2_FIRSTSET;
9817 /* Handle caseless first code units. */
9819 if ((firstcuflags & REQ_CASELESS) != 0)
9821 if (firstcu < 128 || (!utf && firstcu < 255))
9823 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
9826 /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
9827 8-bit UTF mode, codepoints in the range 128-255 are introductory code
9828 points and cannot have another case. In 16-bit and 32-bit modes, we can
9829 check wide characters when UTF (and therefore UCP) is supported. */
9831 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
9832 else if (firstcu <= MAX_UTF_CODE_POINT &&
9833 UCD_OTHERCASE(firstcu) != firstcu)
9834 re->flags |= PCRE2_FIRSTCASELESS;
9839 /* When there is no first code unit, for non-anchored patterns, see if we can
9840 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
9841 branches start with ^ and also when all branches start with non-atomic .* for
9842 non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
9843 that disables this case.) */
9845 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
9846 is_startline(codestart, 0, &cb, 0, FALSE))
9847 re->flags |= PCRE2_STARTLINE;
9849 /* Handle the "required code unit", if one is set. In the case of an anchored
9850 pattern, do this only if it follows a variable length item in the pattern. */
9852 if (reqcuflags >= 0 &&
9853 ((re->overall_options & PCRE2_ANCHORED) == 0 ||
9854 (reqcuflags & REQ_VARY) != 0))
9856 re->last_codeunit = reqcu;
9857 re->flags |= PCRE2_LASTSET;
9859 /* Handle caseless required code units as for first code units (above). */
9861 if ((reqcuflags & REQ_CASELESS) != 0)
9863 if (reqcu < 128 || (!utf && reqcu < 255))
9865 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
9867 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
9868 else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
9869 re->flags |= PCRE2_LASTCASELESS;
9874 /* Finally, study the compiled pattern to set up information such as a bitmap
9875 of starting code units and a minimum matching length. */
9877 if (PRIV(study)(re) != 0)
9882 } /* End of start-of-match optimizations. */
9884 /* Control ends up here in all cases. When running under valgrind, make a
9885 pattern's terminating zero defined again. If memory was obtained for the parsed
9886 version of the pattern, free it before returning. Also free the list of named
9887 groups if a larger one had to be obtained, and likewise the group information vector. */
9891 #ifdef SUPPORT_VALGRIND
9892 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
9894 if (cb.parsed_pattern != stack_parsed_pattern)
9895 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
9896 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
9897 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
9898 if (cb.groupinfo != stack_groupinfo)
9899 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
9900 return re; /* Will be NULL after an error */
9902 /* Errors discovered in parse_regex() set the offset value in the compile
9903 block. Errors discovered before it is called must compute it from the ptr
9904 value. After parse_regex() is called, the offset in the compile block is set to
9905 the end of the pattern, but certain errors in compile_regex() may reset it if
9906 an offset is available in the parsed pattern. */
9909 ptr = pattern + cb.erroroffset;
9912 *erroroffset = ptr - pattern;
9915 *errorptr = errorcode;
9916 pcre2_code_free(re);
9921 /* End of pcre2_compile.c */