1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains functions that scan a compiled pattern and change
42 repeats into possessive repeats where possible. */
50 #include "pcre2_internal.h"
53 /*************************************************
54 * Tables for auto-possessification *
55 *************************************************/
57 /* This table is used to check whether auto-possessification is possible
58 between adjacent character-type opcodes. The left-hand (repeated) opcode is
59 used to select the row, and the right-hand opcode is used to select the column.
60 A value of 1 means that auto-possessification is OK. For example, the second
61 value in the first row means that \D+\d can be turned into \D++\d.
63 The Unicode property types (\P and \p) have to be present to fill out the table
64 because of what their opcode values are, but the table values should always be
65 zero because property types are handled separately in the code. The last four
66 columns apply to items that cannot be repeated, so there is no need to have
67 rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is
68 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
70 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
71 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
73 static const uint8_t autoposstab[APTROWS][APTCOLS] = {
74 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
75 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
76 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
77 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
78 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
79 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
80 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
81 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
82 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
83 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
84 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
85 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
86 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
87 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
88 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
89 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
90 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
91 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
94 #ifdef SUPPORT_UNICODE
95 /* This table is used to check whether auto-possessification is possible
96 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
97 left-hand (repeated) opcode is used to select the row, and the right-hand
98 opcode is used to select the column. The values are as follows:
100 0 Always return FALSE (never auto-possessify)
101 1 Character groups are distinct (possessify if both are OP_PROP)
102 2 Check character categories in the same group (general or particular)
103 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
105 4 Check left general category vs right particular category
106 5 Check right general category vs left particular category
108 6 Left alphanum vs right general category
109 7 Left space vs right general category
110 8 Left word vs right general category
112 9 Right alphanum vs left general category
113 10 Right space vs left general category
114 11 Right word vs left general category
116 12 Left alphanum vs right particular category
117 13 Left space vs right particular category
118 14 Left word vs right particular category
120 15 Right alphanum vs left particular category
121 16 Right space vs left particular category
122 17 Right word vs left particular category
125 static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
126 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
127 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
128 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
129 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
130 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
131 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
132 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
133 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
134 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
135 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
136 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
137 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
140 /* This table is used to check whether auto-possessification is possible
141 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
142 specifies a general category and the other specifies a particular category. The
143 row is selected by the general category and the column by the particular
144 category. The value is 1 if the particular category is not part of the general
147 static const uint8_t catposstab[7][30] = {
148 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
149 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
150 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
151 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
152 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
153 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
154 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
155 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
158 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
159 a general or particular category. The properties in each row are those
160 that apply to the character set in question. Duplication means that a little
161 unnecessary work is done when checking, but this keeps things much simpler
162 because they can all use the same code. For more details see the comment where
165 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
166 "space", but from Perl 5.18 it's included, so both categories are treated the
169 static const uint8_t posspropstab[3][4] = {
170 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
171 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
172 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
174 #endif /* SUPPORT_UNICODE */
178 #ifdef SUPPORT_UNICODE
179 /*************************************************
180 * Check a character and a property *
181 *************************************************/
183 /* This function is called by compare_opcodes() when a property item is
184 adjacent to a fixed character.
188 ptype the property type
189 pdata the data for the type
190 negated TRUE if it's a negated property (\P or \p{^)
192 Returns: TRUE if auto-possessifying is OK
196 check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
200 const ucd_record *prop = GET_UCD(c);
205 return (prop->chartype == ucp_Lu ||
206 prop->chartype == ucp_Ll ||
207 prop->chartype == ucp_Lt) == negated;
210 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
213 return (pdata == prop->chartype) == negated;
216 return (pdata == prop->script) == negated;
218 /* These are specials */
221 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
222 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
224 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
225 means that Perl space and POSIX space are now identical. PCRE was changed
228 case PT_SPACE: /* Perl space */
229 case PT_PXSPACE: /* POSIX space */
237 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
239 break; /* Control never reaches here */
242 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
243 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
244 c == CHAR_UNDERSCORE) == negated;
247 p = PRIV(ucd_caseless_sets) + prop->caseset;
250 if (c < *p) return !negated;
251 if (c == *p++) return negated;
253 break; /* Control never reaches here */
258 #endif /* SUPPORT_UNICODE */
262 /*************************************************
263 * Base opcode of repeated opcodes *
264 *************************************************/
266 /* Returns the base opcode for repeated single character type opcodes. If the
267 opcode is not a repeated character type, it returns with the original value.
270 Returns: base opcode for the type
274 get_repeat_base(PCRE2_UCHAR c)
276 return (c > OP_TYPEPOSUPTO)? c :
277 (c >= OP_TYPESTAR)? OP_TYPESTAR :
278 (c >= OP_NOTSTARI)? OP_NOTSTARI :
279 (c >= OP_NOTSTAR)? OP_NOTSTAR :
280 (c >= OP_STARI)? OP_STARI :
285 /*************************************************
286 * Fill the character property list *
287 *************************************************/
289 /* Checks whether the code points to an opcode that can take part in auto-
290 possessification, and if so, fills a list with its properties.
293 code points to start of expression
294 utf TRUE if in UTF mode
295 fcc points to the case-flipping table
296 list points to output list
297 list[0] will be filled with the opcode
298 list[1] will be non-zero if this opcode
299 can match an empty character string
300 list[2..7] depends on the opcode
302 Returns: points to the start of the next opcode if *code is accepted
303 NULL if *code is not accepted
307 get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
310 PCRE2_UCHAR c = *code;
315 #ifdef SUPPORT_UNICODE
316 uint32_t *clist_dest;
317 const uint32_t *clist_src;
319 (void)utf; /* Suppress "unused parameter" compiler warning */
326 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
328 base = get_repeat_base(c);
329 c -= (base - OP_STAR);
331 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
334 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
367 case OP_NOT_WHITESPACE:
369 case OP_NOT_WORDCHAR:
387 GETCHARINCTEST(chr, code);
394 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
395 GETCHARINCTEST(chr, code);
398 #ifdef SUPPORT_UNICODE
399 if (chr < 128 || (chr < 256 && !utf))
402 list[3] = UCD_OTHERCASE(chr);
403 #elif defined SUPPORT_WIDE_CHARS
404 list[3] = (chr < 256) ? fcc[chr] : chr;
409 /* The othercase might be the same value. */
417 #ifdef SUPPORT_UNICODE
420 if (code[0] != PT_CLIST)
427 /* Convert only if we have enough space. */
429 clist_src = PRIV(ucd_caseless_sets) + code[1];
430 clist_dest = list + 2;
434 if (clist_dest >= list + 8)
436 /* Early return if there is not enough space. This should never
437 happen, since all clists are shorter than 5 characters now. */
442 *clist_dest++ = *clist_src;
444 while(*clist_src++ != NOTACHAR);
446 /* All characters are stored. The terminating NOTACHAR is copied from the
449 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
455 #ifdef SUPPORT_WIDE_CHARS
458 end = code + GET(code, 0) - 1;
461 end = code + 32 / sizeof(PCRE2_UCHAR);
484 list[1] = (GET2(end, 1) == 0);
485 end += 1 + 2 * IMM2_SIZE;
488 list[2] = (uint32_t)(end - code);
491 return NULL; /* Opcode not accepted */
496 /*************************************************
497 * Scan further character sets for match *
498 *************************************************/
500 /* Checks whether the base and the current opcode have a common character, in
501 which case the base cannot be possessified.
504 code points to the byte code
506 cb compile data block
507 base_list the data list of the base opcode
508 base_end the end of the base opcode
509 rec_limit points to recursion depth counter
511 Returns: TRUE if the auto-possessification is possible
515 compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
516 const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
520 const uint32_t *chr_ptr;
521 const uint32_t *ochr_ptr;
522 const uint32_t *list_ptr;
523 PCRE2_SPTR next_code;
524 #ifdef SUPPORT_WIDE_CHARS
525 PCRE2_SPTR xclass_flags;
527 const uint8_t *class_bitset;
528 const uint8_t *set1, *set2, *set_end;
530 BOOL accepted, invert_bits;
531 BOOL entered_a_group = FALSE;
533 if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */
535 /* Note: the base_list[1] contains whether the current opcode has a greedy
536 (represented by a non-zero value) quantifier. This is different from
537 other character type lists, which store here whether the character iterator
538 can match an empty string (also represented by a non-zero value). */
542 /* All operations move the code pointer forward.
543 Therefore infinite recursions are not possible. */
547 /* Skip over callouts */
551 code += PRIV(OP_lengths)[c];
555 if (c == OP_CALLOUT_STR)
557 code += GET(code, 1 + 2*LINK_SIZE);
561 /* At the end of a branch, skip to the end of the group. */
565 do code += GET(code, 1); while (*code == OP_ALT);
569 /* Inspect the next opcode. */
573 /* We can always possessify a greedy iterator at the end of the pattern,
574 which is reached after skipping over the final OP_KET. A non-greedy
575 iterator must never be possessified. */
578 return base_list[1] != 0;
580 /* When an iterator is at the end of certain kinds of group we can inspect
581 what follows the group by skipping over the closing ket. Note that this
582 does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
583 iteration is variable (could be another iteration or could be the next
584 item). As these two opcodes are not listed in the next switch, they will
585 end up as the next code to inspect, and return FALSE by virtue of being
590 /* The non-greedy case cannot be converted to a possessive form. */
592 if (base_list[1] == 0) return FALSE;
594 /* If the bracket is capturing it might be referenced by an OP_RECURSE
595 so its last iterator can never be possessified if the pattern contains
596 recursions. (This could be improved by keeping a list of group numbers that
597 are called by recursion.) */
599 switch(*(code - GET(code, 1)))
605 if (cb->had_recurse) return FALSE;
608 /* Atomic sub-patterns and assertions can always auto-possessify their
609 last iterator. However, if the group was entered as a result of checking
610 a previous iterator, this is not possible. */
615 case OP_ASSERTBACK_NOT:
618 return !entered_a_group;
621 /* Skip over the bracket and inspect what comes next. */
623 code += PRIV(OP_lengths)[c];
626 /* Handle cases where the next item is a group. */
631 next_code = code + GET(code, 1);
632 code += PRIV(OP_lengths)[c];
634 /* Check each branch. We have to recurse a level for all but the last
637 while (*next_code == OP_ALT)
639 if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
641 code = next_code + 1 + LINK_SIZE;
642 next_code += GET(next_code, 1);
645 entered_a_group = TRUE;
651 next_code = code + 1;
652 if (*next_code != OP_BRA && *next_code != OP_CBRA &&
653 *next_code != OP_ONCE) return FALSE;
655 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
657 /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
659 next_code += 1 + LINK_SIZE;
660 if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
663 code += PRIV(OP_lengths)[c];
666 /* The next opcode does not need special handling; fall through and use it
667 to see if the base can be possessified. */
673 /* We now have the next appropriate opcode to compare with the base. Check
674 for a supported opcode, and load its properties. */
676 code = get_chr_property_list(code, utf, cb->fcc, list);
677 if (code == NULL) return FALSE; /* Unsupported */
679 /* If either opcode is a small character list, set pointers for comparing
680 characters from that list with another list, or with a property. */
682 if (base_list[0] == OP_CHAR)
684 chr_ptr = base_list + 2;
687 else if (list[0] == OP_CHAR)
690 list_ptr = base_list;
693 /* Character bitsets can also be compared to certain opcodes. */
695 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
696 #if PCRE2_CODE_UNIT_WIDTH == 8
697 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
698 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
702 #if PCRE2_CODE_UNIT_WIDTH == 8
703 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
705 if (base_list[0] == OP_CLASS)
708 set1 = (uint8_t *)(base_end - base_list[2]);
713 set1 = (uint8_t *)(code - list[2]);
714 list_ptr = base_list;
723 ((list_ptr == list ? code : base_end) - list_ptr[2]);
726 #ifdef SUPPORT_WIDE_CHARS
728 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
729 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
730 if ((*xclass_flags & XCL_MAP) == 0)
732 /* No bits are set for characters < 256. */
733 if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
734 /* Might be an empty repeat. */
737 set2 = (uint8_t *)(xclass_flags + 1);
745 set2 = (uint8_t *)(cb->cbits + cbit_digit);
748 case OP_NOT_WHITESPACE:
752 set2 = (uint8_t *)(cb->cbits + cbit_space);
755 case OP_NOT_WORDCHAR:
759 set2 = (uint8_t *)(cb->cbits + cbit_word);
766 /* Because the bit sets are unaligned bytes, we need to perform byte
774 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
776 while (set1 < set_end);
782 if ((*set1++ & *set2++) != 0) return FALSE;
784 while (set1 < set_end);
787 if (list[1] == 0) return TRUE;
788 /* Might be an empty repeat. */
792 /* Some property combinations also acceptable. Unicode property opcodes are
793 processed specially; the rest can be handled with a lookup table. */
797 uint32_t leftop, rightop;
799 leftop = base_list[0];
802 #ifdef SUPPORT_UNICODE
803 accepted = FALSE; /* Always set in non-unicode case. */
804 if (leftop == OP_PROP || leftop == OP_NOTPROP)
806 if (rightop == OP_EOD)
808 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
812 BOOL same = leftop == rightop;
813 BOOL lisprop = leftop == OP_PROP;
814 BOOL risprop = rightop == OP_PROP;
815 BOOL bothprop = lisprop && risprop;
817 /* There's a table that specifies how each combination is to be
819 0 Always return FALSE (never auto-possessify)
820 1 Character groups are distinct (possessify if both are OP_PROP)
821 2 Check character categories in the same group (general or particular)
822 3 Return TRUE if the two opcodes are not the same
823 ... see comments below
826 n = propposstab[base_list[2]][list[2]];
830 case 1: accepted = bothprop; break;
831 case 2: accepted = (base_list[3] == list[3]) != same; break;
832 case 3: accepted = !same; break;
834 case 4: /* Left general category, right particular category */
835 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
838 case 5: /* Right general category, left particular category */
839 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
842 /* This code is logically tricky. Think hard before fiddling with it.
843 The posspropstab table has four entries per row. Each row relates to
844 one of PCRE's special properties such as ALNUM or SPACE or WORD.
845 Only WORD actually needs all four entries, but using repeats for the
846 others means they can all use the same code below.
848 The first two entries in each row are Unicode general categories, and
849 apply always, because all the characters they include are part of the
850 PCRE character set. The third and fourth entries are a general and a
851 particular category, respectively, that include one or more relevant
852 characters. One or the other is used, depending on whether the check
853 is for a general or a particular category. However, in both cases the
854 category contains more characters than the specials that are defined
855 for the property being tested against. Therefore, it cannot be used
858 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
859 Underscore is covered by ucp_P or ucp_Po. */
861 case 6: /* Left alphanum vs right general category */
862 case 7: /* Left space vs right general category */
863 case 8: /* Left word vs right general category */
864 p = posspropstab[n-6];
865 accepted = risprop && lisprop ==
868 (list[3] != p[2] || !lisprop));
871 case 9: /* Right alphanum vs left general category */
872 case 10: /* Right space vs left general category */
873 case 11: /* Right word vs left general category */
874 p = posspropstab[n-9];
875 accepted = lisprop && risprop ==
876 (base_list[3] != p[0] &&
877 base_list[3] != p[1] &&
878 (base_list[3] != p[2] || !risprop));
881 case 12: /* Left alphanum vs right particular category */
882 case 13: /* Left space vs right particular category */
883 case 14: /* Left word vs right particular category */
884 p = posspropstab[n-12];
885 accepted = risprop && lisprop ==
886 (catposstab[p[0]][list[3]] &&
887 catposstab[p[1]][list[3]] &&
888 (list[3] != p[3] || !lisprop));
891 case 15: /* Right alphanum vs left particular category */
892 case 16: /* Right space vs left particular category */
893 case 17: /* Right word vs left particular category */
894 p = posspropstab[n-15];
895 accepted = lisprop && risprop ==
896 (catposstab[p[0]][base_list[3]] &&
897 catposstab[p[1]][base_list[3]] &&
898 (base_list[3] != p[3] || !risprop));
905 #endif /* SUPPORT_UNICODE */
907 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
908 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
909 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
911 if (!accepted) return FALSE;
913 if (list[1] == 0) return TRUE;
914 /* Might be an empty repeat. */
918 /* Control reaches here only if one of the items is a small character list.
919 All characters are checked against the other side. */
928 ochr_ptr = list_ptr + 2;
931 if (chr == *ochr_ptr) return FALSE;
934 while(*ochr_ptr != NOTACHAR);
938 ochr_ptr = list_ptr + 2;
941 if (chr == *ochr_ptr)
945 while(*ochr_ptr != NOTACHAR);
946 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
949 /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
950 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
953 if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
957 if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
961 if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
964 case OP_NOT_WHITESPACE:
965 if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
969 if (chr < 256 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; /* 255 is a valid index, as in the digit/space checks */
972 case OP_NOT_WORDCHAR:
973 if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
979 HSPACE_CASES: return FALSE;
988 default: return FALSE;
996 VSPACE_CASES: return FALSE;
1004 VSPACE_CASES: break;
1005 default: return FALSE;
1021 #endif /* Not EBCDIC */
1026 case OP_EOD: /* Can always possessify before \z */
1029 #ifdef SUPPORT_UNICODE
1032 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1033 list_ptr[0] == OP_NOTPROP))
1039 if (chr > 255) return FALSE;
1043 if (chr > 255) break;
1044 class_bitset = (uint8_t *)
1045 ((list_ptr == list ? code : base_end) - list_ptr[2]);
1046 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
1049 #ifdef SUPPORT_WIDE_CHARS
1051 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1052 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
1062 while(*chr_ptr != NOTACHAR);
1064 /* At least one character must be matched from this opcode. */
1066 if (list[1] == 0) return TRUE;
1069 /* Control never reaches here. There used to be a fail-safe return FALSE; here,
1070 but some compilers complain about an unreachable statement. */
1075 /*************************************************
1076 * Scan compiled regex for auto-possession *
1077 *************************************************/
1079 /* Replaces single character iterations with their possessive alternatives
1080 if appropriate. This function modifies the compiled opcode! Hitting a
1081 non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
1082 bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
1083 overly complicated or large patterns. In these cases, the check just stops,
1084 leaving the remainder of the pattern unpossessified.
1087 code points to start of the byte code
1088 utf TRUE in UTF mode
1089 cb compile data block
1091 Returns: 0 for success
1092 -1 if a non-existent opcode is encountered
1096 PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
1100 PCRE2_UCHAR *repeat_opcode;
1102 int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
1108 if (c >= OP_TABLE_LENGTH) return -1; /* Something gone wrong */
1110 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1112 c -= get_repeat_base(c) - OP_STAR;
1113 end = (c <= OP_MINUPTO) ?
1114 get_chr_property_list(code, utf, cb->fcc, list) : NULL;
1115 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1117 if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
1122 *code += OP_POSSTAR - OP_STAR;
1126 *code += OP_POSSTAR - OP_MINSTAR;
1130 *code += OP_POSPLUS - OP_PLUS;
1134 *code += OP_POSPLUS - OP_MINPLUS;
1138 *code += OP_POSQUERY - OP_QUERY;
1142 *code += OP_POSQUERY - OP_MINQUERY;
1146 *code += OP_POSUPTO - OP_UPTO;
1150 *code += OP_POSUPTO - OP_MINUPTO;
1156 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
1158 #ifdef SUPPORT_WIDE_CHARS
1160 repeat_opcode = code + GET(code, 1);
1163 repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1166 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1168 /* end must not be NULL. */
1169 end = get_chr_property_list(code, utf, cb->fcc, list);
1171 list[1] = (c & 1) == 0;
1173 if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
1179 *repeat_opcode = OP_CRPOSSTAR;
1184 *repeat_opcode = OP_CRPOSPLUS;
1189 *repeat_opcode = OP_CRPOSQUERY;
1194 *repeat_opcode = OP_CRPOSRANGE;
1208 case OP_TYPEMINSTAR:
1210 case OP_TYPEMINPLUS:
1212 case OP_TYPEMINQUERY:
1213 case OP_TYPEPOSSTAR:
1214 case OP_TYPEPOSPLUS:
1215 case OP_TYPEPOSQUERY:
1216 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1220 case OP_TYPEMINUPTO:
1222 case OP_TYPEPOSUPTO:
1223 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1227 case OP_CALLOUT_STR:
1228 code += GET(code, 1 + 2*LINK_SIZE);
1231 #ifdef SUPPORT_WIDE_CHARS
1233 code += GET(code, 1);
1246 /* Add in the fixed length from the table */
1248 code += PRIV(OP_lengths)[c];
1250 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1251 followed by a multi-byte character. The length in the table is a minimum, so
1252 we have to arrange to skip the extra code units. */
1254 #ifdef MAYBE_UTF_MULTI
1292 case OP_NOTMINQUERY:
1298 case OP_NOTPOSQUERY:
1301 case OP_NOTMINSTARI:
1303 case OP_NOTMINPLUSI:
1305 case OP_NOTMINQUERYI:
1307 case OP_NOTMINUPTOI:
1309 case OP_NOTPOSSTARI:
1310 case OP_NOTPOSPLUSI:
1311 case OP_NOTPOSQUERYI:
1312 case OP_NOTPOSUPTOI:
1313 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1317 (void)(utf); /* Keep compiler happy by referencing function argument */
1318 #endif /* SUPPORT_WIDE_CHARS */
1322 /* End of pcre2_auto_possess.c */