1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
42 /* This module contains mode-dependent macro and structure definitions. The
43 file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
44 These mode-dependent items are kept in a separate file so that they can also be
45 #included multiple times for different code unit widths by pcre2test in order
46 to have access to the hidden structures at all supported widths.
48 Some of the mode-dependent macros are required at different widths for
49 different parts of the pcre2test code (in particular, the included
pcre2_printint.c file). We undefine them here so that they can be re-defined for
51 multiple inclusions. Not all of these are used in pcre2test, but it's easier
52 just to undefine them all. */
60 #undef FORWARDCHARTEST
74 #undef MAX_PATTERN_SIZE
75 #undef MAX_UTF_SINGLE_CU
86 /* -------------------------- MACROS ----------------------------- */
88 /* PCRE keeps offsets in its compiled code as at least 16-bit quantities
89 (always stored in big-endian order in 8-bit mode) by default. These are used,
90 for example, to link from the start of a subpattern to its alternatives and its
91 end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
92 to around 64K, which is big enough for almost everybody. However, I received a
93 request for an even bigger limit. For this reason, and also to make the code
94 easier to maintain, the storing and loading of offsets from the compiled code
95 unit string is now handled by the macros that are defined here.
97 The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
98 values of 3 or 4 are also supported. */
100 /* ------------------- 8-bit support ------------------ */
102 #if PCRE2_CODE_UNIT_WIDTH == 8
106 (a[n] = (PCRE2_UCHAR)((d) >> 8)), \
107 (a[(n)+1] = (PCRE2_UCHAR)((d) & 255))
109 (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
110 #define MAX_PATTERN_SIZE (1 << 16)
114 (a[n] = (PCRE2_UCHAR)((d) >> 16)), \
115 (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
116 (a[(n)+2] = (PCRE2_UCHAR)((d) & 255))
118 (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
119 #define MAX_PATTERN_SIZE (1 << 24)
123 (a[n] = (PCRE2_UCHAR)((d) >> 24)), \
124 (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
125 (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \
126 (a[(n)+3] = (PCRE2_UCHAR)((d) & 255))
128 (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
129 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
132 #error LINK_SIZE must be 2, 3, or 4
136 /* ------------------- 16-bit support ------------------ */
138 #elif PCRE2_CODE_UNIT_WIDTH == 16
144 (a[n] = (PCRE2_UCHAR)(d))
147 #define MAX_PATTERN_SIZE (1 << 16)
149 #elif LINK_SIZE == 3 || LINK_SIZE == 4
153 (a[n] = (PCRE2_UCHAR)((d) >> 16)), \
154 (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535))
156 (unsigned int)(((a)[n] << 16) | (a)[(n)+1])
157 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
160 #error LINK_SIZE must be 2, 3, or 4
164 /* ------------------- 32-bit support ------------------ */
166 #elif PCRE2_CODE_UNIT_WIDTH == 32
173 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
176 #error Unsupported compiling mode
180 /* --------------- Other mode-specific macros ----------------- */
182 /* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
184 things such as capturing parenthesis numbers in back references.
186 Define the number of code units required to hold a 16-bit count/offset, and
187 macros to load and store such a value. For reasons that I do not understand,
188 the expression in the 8-bit GET2 macro is treated by gcc as a signed
189 expression, even when a is declared as unsigned. It seems that any kind of
190 arithmetic results in a signed value. Hence the cast. */
192 #if PCRE2_CODE_UNIT_WIDTH == 8
/* Load (GET2) and store (PUT2) a 16-bit value kept in two consecutive 8-bit
code units, most significant byte first, starting at a[n]. Every use of "a" and
each complete expansion is parenthesized so that the macros still work when "a"
is an expression such as "code + 1" or "*pptr", and when the macro is embedded
in a larger expression. PUT2 evaluates "a", "n" and "d" twice, so none of them
may have side effects. */

#define GET2(a,n) ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
#define PUT2(a,n,d) ((a)[n] = (d) >> 8, (a)[(n)+1] = (d) & 255)
197 #else /* Code units are 16 or 32 bits */
/* Load (GET2) and store (PUT2) a 16-bit value that occupies the single code
unit a[n]. The arguments and the whole expansions are parenthesized so that
"a" may be an expression such as "code + 1" and so that the macros can be used
safely inside larger expressions. */

#define GET2(a,n) ((a)[n])
#define PUT2(a,n,d) ((a)[n] = (d))
203 /* Other macros that are different for 8-bit mode. The MAX_255 macro checks
204 whether its argument, which is assumed to be one code unit, is less than 256.
205 The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK
206 name must fit in one code unit; currently it is set to 255 or 65535. The
207 TABLE_GET macro is used to access elements of tables containing exactly 256
208 items. When code points can be greater than 255, a check is needed before
209 accessing these tables. */
211 #if PCRE2_CODE_UNIT_WIDTH == 8
212 #define MAX_255(c) TRUE
213 #define MAX_MARK ((1u << 8) - 1)
214 #ifdef SUPPORT_UNICODE
215 #define SUPPORT_WIDE_CHARS
216 #define CHMAX_255(c) ((c) <= 255u)
218 #define CHMAX_255(c) TRUE
219 #endif /* SUPPORT_UNICODE */
220 #define TABLE_GET(c, table, default) ((table)[c])
222 #else /* Code units are 16 or 32 bits */
/* MAX_255 and CHMAX_255 both test whether a value is below 256. MAX_MARK is
65535. TABLE_GET indexes "table" only when the value passes MAX_255 and
otherwise yields "default". */

#define MAX_255(c) ((c) < 256u)
#define CHMAX_255(c) ((c) < 256u)
#define MAX_MARK (65535u)
#define SUPPORT_WIDE_CHARS
#define TABLE_GET(c, table, default) \
  (MAX_255(c) ? (table)[c] : (default))
232 /* ----------------- Character-handling macros ----------------- */
234 /* There is a proposed future special "UTF-21" mode, in which only the lowest
235 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
236 high-order bits available to the application for other uses. In preparation for
237 the future implementation of this mode, there are macros that load a data item
238 and, if in this special mode, mask it to 21 bits. These macros all have names
239 starting with UCHAR21. In all other modes, including the normal 32-bit
240 library, the macros all have the same simple definitions. When the new mode is
241 implemented, it is expected that these definitions will be varied appropriately
242 using #ifdef when compiling the library that supports the special mode. */
/* Plain (unmasked) forms of the UCHAR21 macros: each one simply fetches the
code unit that eptr points at. The INC forms also step eptr past that code
unit after fetching it. The TEST forms expand to exactly the same code as the
corresponding non-TEST forms. */

#define UCHAR21(eptr)        ((eptr)[0])
#define UCHAR21TEST(eptr)    UCHAR21(eptr)
#define UCHAR21INC(eptr)     (*((eptr)++))
#define UCHAR21INCTEST(eptr) UCHAR21INC(eptr)
249 /* When UTF encoding is being used, a character is no longer just a single
250 byte in 8-bit mode or a single short in 16-bit mode. The macros for character
251 handling generate simple sequences when used in the basic mode, and more
252 complicated ones for UTF characters. GETCHARLENTEST and other macros are not
253 used when UTF is not supported. To make sure they can never even appear when
254 UTF support is omitted, we don't even define them. */
256 #ifndef SUPPORT_UNICODE
/* Without Unicode support every character is exactly one code unit, so the
character macros below reduce to a plain load or store. The macros that are
listed only inside comments are deliberately left undefined in this
configuration.

The statement-like macros keep their trailing semicolon because existing
callers are written against that form. All arguments are parenthesized so that
an argument such as "ptr + 1" or "*pptr" is read, and (for the INC forms)
advanced, as a single operand. GETCHARLEN ignores "len" because no character
ever needs extra code units here. PUTCHAR stores c at p and yields 1, the
number of code units written. */

/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCU(c) */
#define GETCHAR(c, eptr) (c) = *(eptr);
#define GETCHARTEST(c, eptr) (c) = *(eptr);
#define GETCHARINC(c, eptr) (c) = *(eptr)++;
#define GETCHARINCTEST(c, eptr) (c) = *(eptr)++;
#define GETCHARLEN(c, eptr, len) (c) = *(eptr);
#define PUTCHAR(c, p) (*(p) = (c), 1)
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define FORWARDCHARTEST(eptr,end) */
/* #define ACROSSCHAR(condition, eptr, action) */
274 #else /* SUPPORT_UNICODE */
276 /* ------------------- 8-bit support ------------------ */
278 #if PCRE2_CODE_UNIT_WIDTH == 8
279 #define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */
281 /* The largest UTF code point that can be encoded as a single code unit. */
283 #define MAX_UTF_SINGLE_CU 127
285 /* Tests whether the code point needs extra characters to decode. */
287 #define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)
/* Returns the number of additional code units if HAS_EXTRALEN(c) is TRUE.
Otherwise the result is undefined. */
292 #define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])
294 /* Returns TRUE, if the given value is not the first code unit of a UTF
297 #define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)
299 /* Get the next UTF-8 character, not advancing the pointer. This is called when
300 we know we are in UTF-8 mode. */
302 #define GETCHAR(c, eptr) \
304 if (c >= 0xc0u) GETUTF8(c, eptr);
306 /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
309 #define GETCHARTEST(c, eptr) \
311 if (utf && c >= 0xc0u) GETUTF8(c, eptr);
313 /* Get the next UTF-8 character, advancing the pointer. This is called when we
314 know we are in UTF-8 mode. */
316 #define GETCHARINC(c, eptr) \
318 if (c >= 0xc0u) GETUTF8INC(c, eptr);
320 /* Get the next character, testing for UTF-8 mode, and advancing the pointer.
321 This is called when we don't know if we are in UTF-8 mode. */
323 #define GETCHARINCTEST(c, eptr) \
325 if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);
327 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
328 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
330 #define GETCHARLEN(c, eptr, len) \
332 if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);
334 /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
335 pointer, incrementing length if there are extra bytes. This is called when we
336 do not know if we are in UTF-8 mode. */
338 #define GETCHARLENTEST(c, eptr, len) \
340 if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);
342 /* If the pointer is not at the start of a character, move it back until
343 it is. This is called only in UTF-8 mode - we don't put a test within the macro
344 because almost all calls are already within a block of UTF-8 only code. */
/* BACKCHAR steps eptr backwards while it points at a UTF-8 continuation byte
(a byte of the form 10xxxxxx). The pointer argument is parenthesized wherever
it is used so that an lvalue expression such as "*pptr" is dereferenced and
stepped as one operand. The caller supplies the terminating semicolon. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)--

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
stops when eptr reaches end. */

#define FORWARDCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)++
#define FORWARDCHARTEST(eptr,end) \
  while((eptr) < (end) && (*(eptr) & 0xc0u) == 0x80u) (eptr)++

/* Same as above, but it allows a fully customizable form: "action" is run
repeatedly for as long as "condition" holds and eptr points at a continuation
byte. */

#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && (*(eptr) & 0xc0u) == 0x80u) action

/* Deposit a character into memory, returning the number of code units. When
utf is set and c is above MAX_UTF_SINGLE_CU the work is handed to
PRIV(ord2utf); otherwise c is stored directly at p and the result is 1. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)((c),(p)) : (*(p) = (c), 1))
362 /* ------------------- 16-bit support ------------------ */
364 #elif PCRE2_CODE_UNIT_WIDTH == 16
365 #define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */
367 /* The largest UTF code point that can be encoded as a single code unit. */
369 #define MAX_UTF_SINGLE_CU 65535
371 /* Tests whether the code point needs extra characters to decode. */
373 #define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)
/* Returns the number of additional code units if HAS_EXTRALEN(c) is TRUE.
Otherwise the result is undefined. */
378 #define GET_EXTRALEN(c) 1
380 /* Returns TRUE, if the given value is not the first code unit of a UTF
383 #define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)
385 /* Base macro to pick up the low surrogate of a UTF-16 character, not
386 advancing the pointer. */
/* On entry c holds a high surrogate and eptr points at it. The low 10 bits of
c are combined with the low 10 bits of the following code unit, eptr[1], and
0x10000 is added to give the full code point, which is left in c. The
arguments are parenthesized so that expressions such as "ptr + 1" or "*pptr"
can be passed safely. */

#define GETUTF16(c, eptr) \
   { (c) = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; }
391 /* Get the next UTF-16 character, not advancing the pointer. This is called when
392 we know we are in UTF-16 mode. */
394 #define GETCHAR(c, eptr) \
396 if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
398 /* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
401 #define GETCHARTEST(c, eptr) \
403 if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
405 /* Base macro to pick up the low surrogate of a UTF-16 character, advancing
/* On entry c holds a high surrogate and eptr points at the code unit that
follows it. The low 10 bits of c are combined with the low 10 bits of *eptr,
0x10000 is added, the result is left in c, and eptr is stepped past the code
unit just read. The arguments are parenthesized so that an lvalue expression
such as "*pptr" is the object that gets read and incremented. */

#define GETUTF16INC(c, eptr) \
   { (c) = ((((c) & 0x3ffu) << 10) | (*(eptr)++ & 0x3ffu)) + 0x10000u; }
411 /* Get the next UTF-16 character, advancing the pointer. This is called when we
412 know we are in UTF-16 mode. */
414 #define GETCHARINC(c, eptr) \
416 if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
418 /* Get the next character, testing for UTF-16 mode, and advancing the pointer.
419 This is called when we don't know if we are in UTF-16 mode. */
421 #define GETCHARINCTEST(c, eptr) \
423 if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
425 /* Base macro to pick up the low surrogate of a UTF-16 character, not
426 advancing the pointer, incrementing the length. */
/* On entry c holds a high surrogate and eptr points at it. The low 10 bits of
c are combined with the low 10 bits of eptr[1], 0x10000 is added, the result is
left in c, and len is incremented by one. eptr itself is not moved. The
arguments are parenthesized so that expressions such as "ptr + 1" and lvalue
expressions such as "*plen" behave as single operands. */

#define GETUTF16LEN(c, eptr, len) \
   { (c) = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; \
     (len)++; }
431 /* Get the next UTF-16 character, not advancing the pointer, incrementing
432 length if there is a low surrogate. This is called when we know we are in
435 #define GETCHARLEN(c, eptr, len) \
437 if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
440 pointer, incrementing length if there is a low surrogate. This is called when
441 we do not know if we are in UTF-16 mode. */
443 #define GETCHARLENTEST(c, eptr, len) \
445 if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
447 /* If the pointer is not at the start of a character, move it back until
448 it is. This is called only in UTF-16 mode - we don't put a test within the
449 macro because almost all calls are already within a block of UTF-16 only
/* BACKCHAR steps eptr back by one code unit if it points at a low surrogate
(0xdc00-0xdfff). The test-and-step is wrapped in do { } while (0) so that the
macro is a single statement: written as a bare "if", it would capture an
"else" that follows the macro call. The pointer argument is parenthesized so
that an lvalue expression such as "*pptr" is handled as one operand. The caller
supplies the terminating semicolon. */

#define BACKCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)--; } while (0)

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
requires eptr to be below end. */

#define FORWARDCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } while (0)
#define FORWARDCHARTEST(eptr,end) \
  do { if ((eptr) < (end) && (*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } \
  while (0)

/* Same as above, but it allows a fully customizable form: "action" is run once
if "condition" holds and eptr points at a low surrogate. The inverted
if/else form gives the internal "if" its own "else", so a following "else" in
the caller cannot attach to it, while "action" still expands directly in the
caller's statement context exactly as before. */

#define ACROSSCHAR(condition, eptr, action) \
  if (!((condition) && (*(eptr) & 0xfc00u) == 0xdc00u)) { } else action

/* Deposit a character into memory, returning the number of code units. When
utf is set and c is above MAX_UTF_SINGLE_CU the work is handed to
PRIV(ord2utf); otherwise c is stored directly at p and the result is 1. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)((c),(p)) : (*(p) = (c), 1))
468 /* ------------------- 32-bit support ------------------ */
472 /* These are trivial for the 32-bit library, since all UTF-32 characters fit
473 into one PCRE2_UCHAR unit. */
475 #define MAX_UTF_SINGLE_CU (0x10ffffu)
476 #define HAS_EXTRALEN(c) (0)
477 #define GET_EXTRALEN(c) (0)
478 #define NOT_FIRSTCU(c) (0)
480 /* Get the next UTF-32 character, not advancing the pointer. This is called when
481 we know we are in UTF-32 mode. */
483 #define GETCHAR(c, eptr) \
486 /* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
489 #define GETCHARTEST(c, eptr) \
492 /* Get the next UTF-32 character, advancing the pointer. This is called when we
493 know we are in UTF-32 mode. */
495 #define GETCHARINC(c, eptr) \
498 /* Get the next character, testing for UTF-32 mode, and advancing the pointer.
499 This is called when we don't know if we are in UTF-32 mode. */
501 #define GETCHARINCTEST(c, eptr) \
504 /* Get the next UTF-32 character, not advancing the pointer, not incrementing
505 length (since all UTF-32 is of length 1). This is called when we know we are in
508 #define GETCHARLEN(c, eptr, len) \
/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
512 pointer, not incrementing the length (since all UTF-32 is of length 1).
513 This is called when we do not know if we are in UTF-32 mode. */
515 #define GETCHARLENTEST(c, eptr, len) \
518 /* If the pointer is not at the start of a character, move it back until
519 it is. This is called only in UTF-32 mode - we don't put a test within the
520 macro because almost all calls are already within a block of UTF-32 only
These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */
/* With 32-bit code units the pointer-adjusting macros have nothing to do, so
each expands to an empty statement that still requires the caller's semicolon.
None of the arguments is evaluated. */

#define BACKCHAR(eptr) do { } while (0)

/* Same as above, just in the other direction. */

#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)

/* Same as above, but it allows a fully customizable form. */

#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units, which
is always 1 here. The arguments are parenthesized so that expressions such as
"out + 1" can be passed as the destination. */

#define PUTCHAR(c, p) (*(p) = (c), 1)
540 #endif /* UTF-32 character handling */
541 #endif /* SUPPORT_UNICODE */
544 /* Mode-dependent macros that have the same definition in all modes. */
/* CU2BYTES and BYTES2CU convert between a count of code units and a count of
bytes, using PCRE2_CODE_UNIT_WIDTH/8 as the number of bytes per code unit.
PUTINC and PUT2INC perform a PUT or PUT2 and then advance the pointer "a" by
LINK_SIZE or IMM2_SIZE respectively. */

#define CU2BYTES(x) ((x) * (PCRE2_CODE_UNIT_WIDTH / 8))
#define BYTES2CU(x) ((x) / (PCRE2_CODE_UNIT_WIDTH / 8))
#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE
552 /* ----------------------- HIDDEN STRUCTURES ----------------------------- */
554 /* NOTE: All these structures *must* start with a pcre2_memctl structure. The
555 code that uses them is simpler because it assumes this. */
557 /* The real general context structure. At present it holds only data for custom
560 typedef struct pcre2_real_general_context {
562 } pcre2_real_general_context;
564 /* The real compile context structure */
566 typedef struct pcre2_real_compile_context {
568 int (*stack_guard)(uint32_t, void *);
569 void *stack_guard_data;
570 const uint8_t *tables;
571 PCRE2_SIZE max_pattern_length;
572 uint16_t bsr_convention;
573 uint16_t newline_convention;
574 uint32_t parens_nest_limit;
575 uint32_t extra_options;
576 } pcre2_real_compile_context;
578 /* The real match context structure. */
580 typedef struct pcre2_real_match_context {
583 pcre2_jit_callback jit_callback;
584 void *jit_callback_data;
586 int (*callout)(pcre2_callout_block *, void *);
588 PCRE2_SIZE offset_limit;
590 uint32_t match_limit;
591 uint32_t depth_limit;
592 } pcre2_real_match_context;
594 /* The real convert context structure. */
596 typedef struct pcre2_real_convert_context {
598 uint32_t glob_separator;
599 uint32_t glob_escape;
600 } pcre2_real_convert_context;
602 /* The real compiled code structure. The type for the blocksize field is
603 defined specially because it is required in pcre2_serialize_decode() when
604 copying the size from possibly unaligned memory into a variable of the same
605 type. Use a macro rather than a typedef to avoid compiler warnings when this
606 file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
607 largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
608 argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
611 #undef CODE_BLOCKSIZE_TYPE
612 #define CODE_BLOCKSIZE_TYPE size_t
614 #undef LOOKBEHIND_MAX
615 #define LOOKBEHIND_MAX UINT16_MAX
617 typedef struct pcre2_real_code {
618 pcre2_memctl memctl; /* Memory control fields */
619 const uint8_t *tables; /* The character tables */
620 void *executable_jit; /* Pointer to JIT code */
621 uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
622 CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */
623 uint32_t magic_number; /* Paranoid and endianness check */
624 uint32_t compile_options; /* Options passed to pcre2_compile() */
625 uint32_t overall_options; /* Options after processing the pattern */
626 uint32_t extra_options; /* Taken from compile_context */
627 uint32_t flags; /* Various state flags */
628 uint32_t limit_heap; /* Limit set in the pattern */
629 uint32_t limit_match; /* Limit set in the pattern */
630 uint32_t limit_depth; /* Limit set in the pattern */
631 uint32_t first_codeunit; /* Starting code unit */
632 uint32_t last_codeunit; /* This codeunit must be seen */
633 uint16_t bsr_convention; /* What \R matches */
634 uint16_t newline_convention; /* What is a newline? */
635 uint16_t max_lookbehind; /* Longest lookbehind (characters) */
636 uint16_t minlength; /* Minimum length of match */
637 uint16_t top_bracket; /* Highest numbered group */
638 uint16_t top_backref; /* Highest numbered back reference */
639 uint16_t name_entry_size; /* Size (code units) of table entries */
640 uint16_t name_count; /* Number of name entries in the table */
643 /* The real match data structure. Define ovector as large as it can ever
644 actually be so that array bound checkers don't grumble. Memory for this
645 structure is obtained by calling pcre2_match_data_create(), which sets the size
646 as the offset of ovector plus a pair of elements for each capturable string, so
647 the size varies from call to call. As the maximum number of capturing
648 subpatterns is 65535 we must allow for 65536 strings to include the overall
649 match. (See also the heapframe structure below.) */
651 typedef struct pcre2_real_match_data {
653 const pcre2_real_code *code; /* The pattern used for the match */
654 PCRE2_SPTR subject; /* The subject that was matched */
655 PCRE2_SPTR mark; /* Pointer to last mark */
656 PCRE2_SIZE leftchar; /* Offset to leftmost code unit */
657 PCRE2_SIZE rightchar; /* Offset to rightmost code unit */
658 PCRE2_SIZE startchar; /* Offset to starting code unit */
659 uint16_t matchedby; /* Type of match (normal, JIT, DFA) */
660 uint16_t oveccount; /* Number of pairs */
661 int rc; /* The return code from the match */
662 PCRE2_SIZE ovector[131072]; /* Must be last in the structure */
663 } pcre2_real_match_data;
666 /* ----------------------- PRIVATE STRUCTURES ----------------------------- */
668 /* These structures are not needed for pcre2test. */
670 #ifndef PCRE2_PCRE2TEST
672 /* Structures for checking for mutual recursion when scanning compiled or
675 typedef struct recurse_check {
676 struct recurse_check *prev;
680 typedef struct parsed_recurse_check {
681 struct parsed_recurse_check *prev;
683 } parsed_recurse_check;
685 /* Structure for building a cache when filling in recursion offsets. */
687 typedef struct recurse_cache {
692 /* Structure for maintaining a chain of pointers to the currently incomplete
693 branches, for testing for left recursion while compiling. */
695 typedef struct branch_chain {
696 struct branch_chain *outer;
697 PCRE2_UCHAR *current_branch;
700 /* Structure for building a list of named groups during the first pass of
703 typedef struct named_group {
704 PCRE2_SPTR name; /* Points to the name in the pattern */
705 uint32_t number; /* Group number */
706 uint16_t length; /* Length of the name */
707 uint16_t isdup; /* TRUE if a duplicate */
710 /* Structure for passing "static" information around between the functions
711 doing the compiling, so that they are thread-safe. */
713 typedef struct compile_block {
714 pcre2_real_compile_context *cx; /* Points to the compile context */
715 const uint8_t *lcc; /* Points to lower casing table */
716 const uint8_t *fcc; /* Points to case-flipping table */
717 const uint8_t *cbits; /* Points to character type table */
718 const uint8_t *ctypes; /* Points to table of type maps */
719 PCRE2_SPTR start_workspace; /* The start of working space */
720 PCRE2_SPTR start_code; /* The start of the compiled code */
721 PCRE2_SPTR start_pattern; /* The start of the pattern */
722 PCRE2_SPTR end_pattern; /* The end of the pattern */
723 PCRE2_UCHAR *name_table; /* The name/number table */
724 PCRE2_SIZE workspace_size; /* Size of workspace */
725 PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */
726 PCRE2_SIZE erroroffset; /* Offset of error in pattern */
727 uint16_t names_found; /* Number of entries so far */
728 uint16_t name_entry_size; /* Size of each entry */
729 uint16_t parens_depth; /* Depth of nested parentheses */
730 uint16_t assert_depth; /* Depth of nested assertions */
731 open_capitem *open_caps; /* Chain of open capture items */
732 named_group *named_groups; /* Points to vector in pre-compile */
733 uint32_t named_group_list_size; /* Number of entries in the list */
734 uint32_t external_options; /* External (initial) options */
735 uint32_t external_flags; /* External flag bits to be set */
736 uint32_t bracount; /* Count of capturing parentheses */
737 uint32_t lastcapture; /* Last capture encountered */
738 uint32_t *parsed_pattern; /* Parsed pattern buffer */
739 uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */
740 uint32_t *groupinfo; /* Group info vector */
741 uint32_t top_backref; /* Maximum back reference */
742 uint32_t backref_map; /* Bitmap of low back refs */
743 uint32_t nltype; /* Newline type */
744 uint32_t nllen; /* Newline string length */
745 uint32_t class_range_start; /* Overall class range start */
746 uint32_t class_range_end; /* Overall class range end */
747 PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
748 int max_lookbehind; /* Maximum lookbehind (characters) */
749 int req_varyopt; /* "After variable item" flag for reqbyte */
750 BOOL had_accept; /* (*ACCEPT) encountered */
751 BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
752 BOOL had_recurse; /* Had a recursion or subroutine call */
753 BOOL dupnames; /* Duplicate names exist */
756 /* Structure for keeping the properties of the in-memory stack used
757 by the JIT matcher. */
759 typedef struct pcre2_real_jit_stack {
762 } pcre2_real_jit_stack;
764 /* Structure for items in a linked list that represents an explicit recursive
call within the pattern when running pcre2_dfa_match(). */
767 typedef struct dfa_recursion_info {
768 struct dfa_recursion_info *prevrec;
769 PCRE2_SPTR subject_position;
771 } dfa_recursion_info;
773 /* Structure for "stack" frames that are used for remembering backtracking
774 positions during matching. As these are used in a vector, with the ovector item
775 being extended, the size of the structure must be a multiple of PCRE2_SIZE. The
776 only way to check this at compile time is to force an error by generating an
777 array with a negative size. By putting this in a typedef (which is never used),
778 we don't generate any code when all is well. */
780 typedef struct heapframe {
782 /* The first set of fields are variables that have to be preserved over calls
783 to RRMATCH(), but which do not need to be copied to new frames. */
785 PCRE2_SPTR ecode; /* The current position in the pattern */
786 PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE_SPTR values */
787 PCRE2_SIZE length; /* Used for character, string, or code lengths */
788 PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */
789 PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */
790 uint32_t rdepth; /* "Recursion" depth */
791 uint32_t group_frame_type; /* Type information for group frames */
792 uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */
793 uint8_t return_id; /* Where to go on in internal "return" */
794 uint8_t op; /* Processing opcode */
796 /* At this point, the structure is 16-bit aligned. On most architectures
797 the alignment requirement for a pointer will ensure that the eptr field below
798 is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer
799 that is 16-bit aligned. We must therefore ensure that what comes between here
800 and eptr is an odd multiple of 16 bits so as to get back into 32-bit
801 alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs
802 fudges in the other cases. In the 32-bit case the padding comes first so that
803 the occu field itself is 32-bit aligned. Without the padding, this structure
804 is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */
806 #if PCRE2_CODE_UNIT_WIDTH == 8
807 PCRE2_UCHAR occu[6]; /* Used for other case code units */
808 #elif PCRE2_CODE_UNIT_WIDTH == 16
809 PCRE2_UCHAR occu[2]; /* Used for other case code units */
810 uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */
812 uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */
813 PCRE2_UCHAR occu[1]; /* Used for other case code units */
816 /* The rest have to be copied from the previous frame whenever a new frame
817 becomes current. The final field is specified as a large vector so that
818 runtime array bound checks don't catch references to it. However, for any
819 specific call to pcre2_match() the memory allocated for each frame structure
820 allows for exactly the right size ovector for the number of capturing
821 parentheses. (See also the comment for pcre2_real_match_data above.) */
823 PCRE2_SPTR eptr; /* MUST BE FIRST */
824 PCRE2_SPTR start_match; /* Can be adjusted by \K */
825 PCRE2_SPTR mark; /* Most recent mark on the success path */
826 uint32_t current_recurse; /* Current (deepest) recursion number */
827 uint32_t capture_last; /* Most recent capture */
828 PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */
829 PCRE2_SIZE offset_top; /* Offset after highest capture */
830 PCRE2_SIZE ovector[131072]; /* Must be last in the structure */
833 /* This typedef is a check that the size of the heapframe structure is a
834 multiple of PCRE2_SIZE. See various comments above. */
836 typedef char check_heapframe_size[
837 ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)];
839 /* Structure for passing "static" information around between the functions
840 doing traditional NFA matching (pcre2_match() and friends). */
842 typedef struct match_block {
843 pcre2_memctl memctl; /* For general use */
844 PCRE2_SIZE frame_vector_size; /* Size of a backtracking frame */
845 heapframe *match_frames; /* Points to vector of frames */
846 heapframe *match_frames_top; /* Points after the end of the vector */
847 heapframe *stack_frames; /* The original vector on the stack */
848 PCRE2_SIZE heap_limit; /* As it says */
849 uint32_t match_limit; /* As it says */
850 uint32_t match_limit_depth; /* As it says */
851 uint32_t match_call_count; /* Number of times a new frame is created */
852 BOOL hitend; /* Hit the end of the subject at some point */
853 BOOL hasthen; /* Pattern contains (*THEN) */
854 const uint8_t *lcc; /* Points to lower casing table */
855 const uint8_t *fcc; /* Points to case-flipping table */
856 const uint8_t *ctypes; /* Points to table of type maps */
857 PCRE2_SIZE start_offset; /* The start offset value */
858 PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */
859 uint16_t partial; /* PARTIAL options */
860 uint16_t bsr_convention; /* \R interpretation */
861 uint16_t name_count; /* Number of names in name table */
862 uint16_t name_entry_size; /* Size of entry in names table */
863 PCRE2_SPTR name_table; /* Table of group names */
864 PCRE2_SPTR start_code; /* For use when recursing */
865 PCRE2_SPTR start_subject; /* Start of the subject string */
866 PCRE2_SPTR end_subject; /* End of the subject string */
867 PCRE2_SPTR end_match_ptr; /* Subject position at end match */
868 PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
869 PCRE2_SPTR last_used_ptr; /* Latest consulted character */
870 PCRE2_SPTR mark; /* Mark pointer to pass back on success */
871 PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
872 PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
873 PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
874 uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
875 uint32_t moptions; /* Match options */
876 uint32_t poptions; /* Pattern options */
877 uint32_t skip_arg_count; /* For counting SKIP_ARGs */
878 uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */
879 uint32_t nltype; /* Newline type */
880 uint32_t nllen; /* Newline string length */
881 PCRE2_UCHAR nl[4]; /* Newline string when fixed */
882 pcre2_callout_block *cb; /* Points to a callout block */
883 void *callout_data; /* To pass back to callouts */
884 int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
887 /* A similar structure is used for the same purpose by the DFA matching
890 typedef struct dfa_match_block {
891 pcre2_memctl memctl; /* For general use */
892 PCRE2_SPTR start_code; /* Start of the compiled pattern */
893 PCRE2_SPTR start_subject ; /* Start of the subject string */
894 PCRE2_SPTR end_subject; /* End of subject string */
895 PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
896 PCRE2_SPTR last_used_ptr; /* Latest consulted character */
897 const uint8_t *tables; /* Character tables */
898 PCRE2_SIZE start_offset; /* The start offset value */
899 PCRE2_SIZE heap_limit; /* As it says */
900 PCRE2_SIZE heap_used; /* As it says */
901 uint32_t match_limit; /* As it says */
902 uint32_t match_limit_depth; /* As it says */
903 uint32_t match_call_count; /* Number of calls of internal function */
904 uint32_t moptions; /* Match options */
905 uint32_t poptions; /* Pattern options */
906 uint32_t nltype; /* Newline type */
907 uint32_t nllen; /* Newline string length */
908 PCRE2_UCHAR nl[4]; /* Newline string when fixed */
909 uint16_t bsr_convention; /* \R interpretation */
910 pcre2_callout_block *cb; /* Points to a callout block */
911 void *callout_data; /* To pass back to callouts */
912 int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
913 dfa_recursion_info *recursive; /* Linked list of recursion data */
916 #endif /* PCRE2_PCRE2TEST */
918 /* End of pcre2_intmodedep.h */