1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
49 #include "pcre2_internal.h"
51 /* The maximum remembered capturing brackets minimum. */
53 #define MAX_CACHE_BACKREF 128
55 /* Set a bit in the starting code unit bit map. */
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
59 /* Returns from set_start_bits() */
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
64 /*************************************************
65 * Find the minimum subject length for a group *
66 *************************************************/
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
83 re compiled pattern block
84 code pointer to start of group (the bracket)
85 startcode pointer to start of the whole pattern's code
87 recurses chain of recurse_check to catch mutual recursion
88 countptr pointer to call count (to catch over complexity)
89 backref_cache vector for caching back references.
91 Returns: the minimum length
94 or pattern too complicated
95 or back reference to duplicate name/number
96 -2 internal error (missing capturing bracket)
97 -3 internal error (opcode not listed)
101 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
102 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
106 int prev_cap_recno = -1;
108 int prev_recurse_recno = -1;
109 int prev_recurse_d = 0;
110 uint32_t once_fudge = 0;
111 BOOL had_recurse = FALSE;
112 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
113 recurse_check this_recurse;
114 int branchlength = 0;
115 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
117 /* If this is a "could be empty" group, its minimum length is 0. */
119 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
121 /* Skip over capturing bracket number */
123 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
125 /* A large and/or complex regex can take too long to process. */
127 if ((*countptr)++ > 1000) return -1;
129 /* Scan along the opcodes for this branch. If we get to the end of the branch,
130 check the length against that of the other branches. If the accumulated length
131 passes 16-bits, stop. */
136 PCRE2_UCHAR *cs, *ce;
137 PCRE2_UCHAR op = *cc;
139 if (branchlength >= UINT16_MAX) return UINT16_MAX;
146 /* If there is only one branch in a condition, the implied branch has zero
147 length, so we don't add anything. This covers the DEFINE "condition"
148 automatically. If there are two branches we can treat it the same as any
149 other non-capturing subpattern. */
151 cs = cc + GET(cc, 1);
154 cc = cs + 1 + LINK_SIZE;
157 goto PROCESS_NON_CAPTURE;
160 /* There's a special case of OP_BRA, when it is wrapped round a repeated
161 OP_RECURSE. We'd like to process the latter at this level so that
162 remembering the value works for repeated cases. So we do nothing, but
163 set a fudge value to skip over the OP_KET after the recurse. */
165 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
167 once_fudge = 1 + LINK_SIZE;
178 d = find_minlength(re, cc, startcode, utf, recurses, countptr,
182 do cc += GET(cc, 1); while (*cc == OP_ALT);
186 /* To save time for repeated capturing subpatterns, we remember the
187 length of the previous one. Unfortunately we can't do the same for
188 the unnumbered ones above. Nor can we do this if (?| is present in the
189 pattern because captures with the same number are not then identical. */
195 recno = (int)GET2(cc, 1+LINK_SIZE);
196 if (dupcapused || recno != prev_cap_recno)
198 prev_cap_recno = recno;
199 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
201 if (prev_cap_d < 0) return prev_cap_d;
203 branchlength += prev_cap_d;
204 do cc += GET(cc, 1); while (*cc == OP_ALT);
208 /* ACCEPT makes things far too complicated; we have to give up. */
211 case OP_ASSERT_ACCEPT:
214 /* Reached end of a branch; if it's a ket it is the end of a nested
215 call. If it's ALT it is an alternation in a nested call. If it is END it's
216 the end of the outer call. All can be handled by the same code. If an
217 ACCEPT was previously encountered, use the length that was in force at that
218 time, and pass back the shortest ACCEPT length. */
226 if (length < 0 || (!had_recurse && branchlength < length))
227 length = branchlength;
228 if (op != OP_ALT) return length;
234 /* Skip over assertive subpatterns */
239 case OP_ASSERTBACK_NOT:
240 do cc += GET(cc, 1); while (*cc == OP_ALT);
243 /* Skip over things that don't match chars */
261 case OP_NOT_WORD_BOUNDARY:
262 case OP_WORD_BOUNDARY:
263 cc += PRIV(OP_lengths)[*cc];
267 cc += GET(cc, 1 + 2*LINK_SIZE);
270 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
276 cc += PRIV(OP_lengths)[*cc];
277 do cc += GET(cc, 1); while (*cc == OP_ALT);
281 /* Handle literal characters and + repetitions */
301 #ifdef SUPPORT_UNICODE
302 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
310 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
313 /* Handle exact repetitions. The count is already in characters, but we
314 may need to skip over a multibyte character in UTF mode. */
320 branchlength += GET2(cc,1);
322 #ifdef SUPPORT_UNICODE
323 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
328 branchlength += GET2(cc,1);
329 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
330 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
333 /* Handle single-char non-literal matchers */
342 case OP_NOT_WHITESPACE:
344 case OP_NOT_WORDCHAR:
357 /* "Any newline" might match two characters, but it also might match just
365 /* The single-byte matcher means we can't proceed in UTF mode. (In
366 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
367 appear, but leave the code, just in case.) */
370 #ifdef SUPPORT_UNICODE
377 /* For repeated character types, we have to test for \p and \P, which have
378 an extra two bytes of parameters. */
383 case OP_TYPEMINQUERY:
385 case OP_TYPEPOSQUERY:
386 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
387 cc += PRIV(OP_lengths)[op];
393 if (cc[1 + IMM2_SIZE] == OP_PROP
394 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
395 cc += PRIV(OP_lengths)[op];
398 /* Check a class for variable quantification */
402 #ifdef SUPPORT_WIDE_CHARS
404 /* The original code caused an unsigned overflow in 64 bit systems,
405 so now we use a conditional statement. */
409 cc += PRIV(OP_lengths)[OP_CLASS];
411 cc += PRIV(OP_lengths)[OP_CLASS];
434 branchlength += GET2(cc,1);
435 cc += 1 + 2 * IMM2_SIZE;
444 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
445 way: we find the minimum length for the subpattern. A recursion
446 (backreference or subroutine) causes an a flag to be set that causes the
447 length of this branch to be ignored. The logic is that a recursion can only
448 make sense if there is another alternative that stops the recursing. That
449 will provide the minimum length (when no recursion happens).
451 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
452 matches an empty string (by default it causes a matching failure), so in
453 that case we must set the minimum length to zero. */
455 /* Duplicate named pattern back reference. We cannot reliably find a length
456 for this if duplicate numbers are present in the pattern. */
460 if (dupcapused) return -1;
461 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
463 int count = GET2(cc, 1+IMM2_SIZE);
465 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
466 GET2(cc, 1) * re->name_entry_size;
470 /* Scan all groups with the same name; find the shortest. */
475 recno = GET2(slot, 0);
477 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
478 dd = backref_cache[recno];
481 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
482 if (cs == NULL) return -2;
483 do ce += GET(ce, 1); while (*ce == OP_ALT);
484 if (cc > cs && cc < ce) /* Simple recursion */
491 recurse_check *r = recurses;
492 for (r = recurses; r != NULL; r = r->prev)
493 if (r->group == cs) break;
494 if (r != NULL) /* Mutual recursion */
501 this_recurse.prev = recurses;
502 this_recurse.group = cs;
503 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
504 countptr, backref_cache);
505 if (dd < 0) return dd;
509 backref_cache[recno] = dd;
510 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
511 backref_cache[0] = recno;
515 if (d <= 0) break; /* No point looking at any more */
516 slot += re->name_entry_size;
520 cc += 1 + 2*IMM2_SIZE;
521 goto REPEAT_BACK_REFERENCE;
523 /* Single back reference. We cannot find a length for this if duplicate
524 numbers are present in the pattern. */
528 if (dupcapused) return -1;
530 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
531 d = backref_cache[recno];
535 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
537 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
538 if (cs == NULL) return -2;
539 do ce += GET(ce, 1); while (*ce == OP_ALT);
540 if (cc > cs && cc < ce) /* Simple recursion */
547 recurse_check *r = recurses;
548 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
549 if (r != NULL) /* Mutual recursion */
556 this_recurse.prev = recurses;
557 this_recurse.group = cs;
558 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
566 backref_cache[recno] = d;
567 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
568 backref_cache[0] = recno;
573 /* Handle repeated back references */
575 REPEAT_BACK_REFERENCE:
599 cc += 1 + 2 * IMM2_SIZE;
607 /* Take care not to overflow: (1) min and d are ints, so check that their
608 product is not greater than INT_MAX. (2) branchlength is limited to
609 UINT16_MAX (checked at the top of the loop). */
611 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
612 branchlength = UINT16_MAX;
613 else branchlength += min * d;
616 /* Recursion always refers to the first occurrence of a subpattern with a
617 given number. Therefore, we can always make use of caching, even when the
618 pattern contains multiple subpatterns with the same number. */
621 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
622 recno = GET2(cs, 1+LINK_SIZE);
623 if (recno == prev_recurse_recno)
625 branchlength += prev_recurse_d;
629 do ce += GET(ce, 1); while (*ce == OP_ALT);
630 if (cc > cs && cc < ce) /* Simple recursion */
634 recurse_check *r = recurses;
635 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
636 if (r != NULL) /* Mutual recursion */
640 this_recurse.prev = recurses;
641 this_recurse.group = cs;
642 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
643 countptr, backref_cache);
644 if (prev_recurse_d < 0) return prev_recurse_d;
645 prev_recurse_recno = recno;
646 branchlength += prev_recurse_d;
650 cc += 1 + LINK_SIZE + once_fudge;
654 /* Anything else does not or need not match a character. We can get the
655 item's length from the table, but for those that can match zero occurrences
656 of a character, we must take special action for UTF-8 characters. As it
657 happens, the "NOT" versions of these opcodes are used at present only for
658 ASCII characters, so they could be omitted from this list. However, in
659 future that may change, so we include them here so as not to leave a
660 gotcha for a future maintainer. */
695 case OP_NOTMINQUERYI:
699 case OP_NOTPOSQUERYI:
701 cc += PRIV(OP_lengths)[op];
702 #ifdef SUPPORT_UNICODE
703 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
707 /* Skip these, but we need to add in the name length. */
714 cc += PRIV(OP_lengths)[op] + cc[1];
717 /* The remaining opcodes are just skipped over. */
726 cc += PRIV(OP_lengths)[op];
729 /* This should not occur: we list all opcodes explicitly so that when
730 new ones get added they are properly considered. */
736 /* Control never gets here */
741 /*************************************************
742 * Set a bit and maybe its alternate case *
743 *************************************************/
745 /* Given a character, set its first code unit's bit in the table, and also the
746 corresponding bit for the other version of a letter if we are caseless.
749 re points to the regex block
750 p points to the first code unit of the character
751 caseless TRUE if caseless
752 utf TRUE for UTF mode
754 Returns: pointer after the character
758 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
760 uint32_t c = *p++; /* First code unit */
761 (void)utf; /* Stop compiler warning when UTF not supported */
763 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
766 #if PCRE2_CODE_UNIT_WIDTH != 8
767 if (c > 0xff) SET_BIT(0xff); else
772 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
773 the end of the character, even when caseless. */
775 #ifdef SUPPORT_UNICODE
778 #if PCRE2_CODE_UNIT_WIDTH == 8
779 if (c >= 0xc0) GETUTF8INC(c, p);
780 #elif PCRE2_CODE_UNIT_WIDTH == 16
781 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
784 #endif /* SUPPORT_UNICODE */
786 /* If caseless, handle the other case of the character. */
790 #ifdef SUPPORT_UNICODE
793 #if PCRE2_CODE_UNIT_WIDTH == 8
795 c = UCD_OTHERCASE(c);
796 (void)PRIV(ord2utf)(c, buff);
798 #else /* 16-bit or 32-bit mode */
799 c = UCD_OTHERCASE(c);
800 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
804 #endif /* SUPPORT_UNICODE */
808 if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
816 /*************************************************
817 * Set bits for a positive character type *
818 *************************************************/
820 /* This function sets starting bits for a character type. In UTF-8 mode, we can
821 only do a direct setting for bytes less than 128, as otherwise there can be
822 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
823 environment, the tables will only recognize ASCII characters anyway, but in at
824 least one Windows environment, some higher bytes bits were set in the tables.
825 So we deal with that case by considering the UTF-8 encoding.
829 cbit type the type of character wanted
830 table_limit 32 for non-UTF-8; 16 for UTF-8
836 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
839 for (c = 0; c < table_limit; c++)
840 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
841 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
842 if (table_limit == 32) return;
843 for (c = 128; c < 256; c++)
845 if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
848 (void)PRIV(ord2utf)(c, buff);
856 /*************************************************
857 * Set bits for a negative character type *
858 *************************************************/
860 /* This function sets starting bits for a negative character type such as \D.
861 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
862 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
863 Unlike in the positive case, where we can set appropriate starting bits for
864 specific high-valued UTF-8 characters, in this case we have to set the bits for
865 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
866 0xc0 (192) for simplicity.
870 cbit type the type of character wanted
871 table_limit 32 for non-UTF-8; 16 for UTF-8
877 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
880 for (c = 0; c < table_limit; c++)
881 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
882 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
883 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
889 /*************************************************
890 * Create bitmap of starting bytes *
891 *************************************************/
893 /* This function scans a compiled unanchored expression recursively and
894 attempts to build a bitmap of the set of possible starting code units whose
895 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
896 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
897 we pass a value of 16 rather than 32 as the final argument. (See comments in
898 those functions for the reason.)
900 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
901 (a*)b where the group provides some optional starting code units but scanning
902 must continue at the outer level to find at least one mandatory code unit. At
903 the outermost level, this function fails unless the result is SSB_DONE.
906 re points to the compiled regex block
907 code points to an expression
908 utf TRUE if in UTF mode
910 Returns: SSB_FAIL => Failed to find any starting code units
911 SSB_DONE => Found mandatory starting code units
912 SSB_CONTINUE => Found optional starting code units
913 SSB_UNKNOWN => Hit an unrecognized opcode
917 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
920 int yield = SSB_DONE;
922 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
923 int table_limit = utf? 16:32;
925 int table_limit = 32;
930 BOOL try_next = TRUE;
931 PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
933 if (*code == OP_CBRA || *code == OP_SCBRA ||
934 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
936 while (try_next) /* Loop for items in this branch */
939 uint8_t *classmap = NULL;
943 /* If we reach something we don't understand, it means a new opcode has
944 been created that hasn't been added to this function. Hopefully this
945 problem will be discovered during testing. */
950 /* Fail for a valid opcode that implies no starting bits. */
953 case OP_ASSERT_ACCEPT:
984 case OP_NOTMINQUERYI:
994 case OP_NOTPOSQUERYI:
1025 /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1026 uses OP_CIRCM). Skip over it. */
1029 tcode += PRIV(OP_lengths)[OP_CIRC];
1032 /* A "real" property test implies no starting bits, but the fake property
1033 PT_CLIST identifies a list of characters. These lists are short, as they
1034 are used for characters with more than one "other case", so there is no
1035 point in recognizing them for OP_NOTPROP. */
1038 if (tcode[1] != PT_CLIST) return SSB_FAIL;
1040 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1041 while ((c = *p++) < NOTACHAR)
1043 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1046 PCRE2_UCHAR buff[6];
1047 (void)PRIV(ord2utf)(c, buff);
1051 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1057 /* We can ignore word boundary tests. */
1059 case OP_WORD_BOUNDARY:
1060 case OP_NOT_WORD_BOUNDARY:
1064 /* If we hit a bracket or a positive lookahead assertion, recurse to set
1065 bits from within the subpattern. If it can't find anything, we have to
1066 give up. If it finds some mandatory character(s), we are done for this
1067 branch. Otherwise, carry on scanning after the subpattern. */
1079 rc = set_start_bits(re, tcode, utf);
1080 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1081 if (rc == SSB_DONE) try_next = FALSE; else
1083 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1084 tcode += 1 + LINK_SIZE;
1088 /* If we hit ALT or KET, it means we haven't found anything mandatory in
1089 this branch, though we might have found something optional. For ALT, we
1090 continue with the next alternative, but we have to arrange that the final
1091 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1092 return SSB_CONTINUE: if this is the top level, that indicates failure,
1093 but after a nested subpattern, it causes scanning to continue. */
1096 yield = SSB_CONTINUE;
1104 return SSB_CONTINUE;
1106 /* Skip over callout */
1109 tcode += PRIV(OP_lengths)[OP_CALLOUT];
1112 case OP_CALLOUT_STR:
1113 tcode += GET(tcode, 1 + 2*LINK_SIZE);
1116 /* Skip over lookbehind and negative lookahead assertions */
1120 case OP_ASSERTBACK_NOT:
1121 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1122 tcode += 1 + LINK_SIZE;
1125 /* BRAZERO does the bracket, but carries on. */
1130 rc = set_start_bits(re, ++tcode, utf);
1131 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1132 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1133 tcode += 1 + LINK_SIZE;
1136 /* SKIPZERO skips the bracket. */
1140 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1141 tcode += 1 + LINK_SIZE;
1144 /* Single-char * or ? sets the bit and tries the next item */
1152 tcode = set_table_bit(re, tcode + 1, FALSE, utf);
1161 tcode = set_table_bit(re, tcode + 1, TRUE, utf);
1164 /* Single-char upto sets the bit and tries the next */
1169 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
1175 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
1178 /* At least one single char sets the bit and stops */
1187 (void)set_table_bit(re, tcode + 1, FALSE, utf);
1198 (void)set_table_bit(re, tcode + 1, TRUE, utf);
1202 /* Special spacing and line-terminating items. These recognize specific
1203 lists of characters. The difference between VSPACE and ANYNL is that the
1204 latter can match the two-character CRLF sequence, but that is not
1205 relevant for finding the first character, so their code here is
1210 SET_BIT(CHAR_SPACE);
1212 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1213 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1215 #if PCRE2_CODE_UNIT_WIDTH != 8
1219 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1220 units of horizontal space characters. */
1222 #ifdef SUPPORT_UNICODE
1225 SET_BIT(0xC2); /* For U+00A0 */
1226 SET_BIT(0xE1); /* For U+1680, U+180E */
1227 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1228 SET_BIT(0xE3); /* For U+3000 */
1232 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1233 the code is EBCDIC. */
1237 #endif /* Not EBCDIC */
1239 #endif /* 8-bit support */
1251 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1252 the bits for NEL and for code units >= 255, independently of UTF. */
1254 #if PCRE2_CODE_UNIT_WIDTH != 8
1258 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1259 units of vertical space characters. */
1261 #ifdef SUPPORT_UNICODE
1264 SET_BIT(0xC2); /* For U+0085 (NEL) */
1265 SET_BIT(0xE2); /* For U+2028, U+2029 */
1269 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1273 #endif /* 8-bit support */
1278 /* Single character types set the bits and stop. Note that if PCRE2_UCP
1279 is set, we do not see these opcodes because \d etc are converted to
1280 properties. Therefore, these apply in the case when only characters less
1281 than 256 are recognized to match the types. */
1284 set_nottype_bits(re, cbit_digit, table_limit);
1289 set_type_bits(re, cbit_digit, table_limit);
1293 case OP_NOT_WHITESPACE:
1294 set_nottype_bits(re, cbit_space, table_limit);
1299 set_type_bits(re, cbit_space, table_limit);
1303 case OP_NOT_WORDCHAR:
1304 set_nottype_bits(re, cbit_word, table_limit);
1309 set_type_bits(re, cbit_word, table_limit);
1313 /* One or more character type fudges the pointer and restarts, knowing
1314 it will hit a single character type and stop there. */
1317 case OP_TYPEMINPLUS:
1318 case OP_TYPEPOSPLUS:
1323 tcode += 1 + IMM2_SIZE;
1326 /* Zero or more repeats of character types set the bits and then
1330 case OP_TYPEMINUPTO:
1331 case OP_TYPEPOSUPTO:
1332 tcode += IMM2_SIZE; /* Fall through */
1335 case OP_TYPEMINSTAR:
1336 case OP_TYPEPOSSTAR:
1338 case OP_TYPEMINQUERY:
1339 case OP_TYPEPOSQUERY:
1349 SET_BIT(CHAR_SPACE);
1351 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1352 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1354 #if PCRE2_CODE_UNIT_WIDTH != 8
1358 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1359 units of horizontal space characters. */
1361 #ifdef SUPPORT_UNICODE
1364 SET_BIT(0xC2); /* For U+00A0 */
1365 SET_BIT(0xE1); /* For U+1680, U+180E */
1366 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1367 SET_BIT(0xE3); /* For U+3000 */
1371 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1372 the code is EBCDIC. */
1376 #endif /* Not EBCDIC */
1378 #endif /* 8-bit support */
1388 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1389 the bits for NEL and for code units >= 255, independently of UTF. */
1391 #if PCRE2_CODE_UNIT_WIDTH != 8
1395 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1396 units of vertical space characters. */
1398 #ifdef SUPPORT_UNICODE
1401 SET_BIT(0xC2); /* For U+0085 (NEL) */
1402 SET_BIT(0xE2); /* For U+2028, U+2029 */
1406 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1410 #endif /* 8-bit support */
1414 set_nottype_bits(re, cbit_digit, table_limit);
1418 set_type_bits(re, cbit_digit, table_limit);
1421 case OP_NOT_WHITESPACE:
1422 set_nottype_bits(re, cbit_space, table_limit);
1426 set_type_bits(re, cbit_space, table_limit);
1429 case OP_NOT_WORDCHAR:
1430 set_nottype_bits(re, cbit_word, table_limit);
1434 set_type_bits(re, cbit_word, table_limit);
1441 /* Extended class: if there are any property checks, or if this is a
1442 negative XCLASS without a map, give up. If there are no property checks,
1443 there must be wide characters on the XCLASS list, because otherwise an
1444 XCLASS would not have been created. This means that code points >= 255
1445 are always potential starters. */
1447 #ifdef SUPPORT_WIDE_CHARS
1449 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1450 (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1453 /* We have a positive XCLASS or a negative one without a map. Set up the
1454 map pointer if there is one, and fall through. */
1456 classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1457 (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1459 /* It seems that the fall through comment must be outside the #ifdef if
1460 it is to avoid the gcc compiler warning. */
1464 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1465 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1466 because it starts a character with a value > 255. In 8-bit non-UTF mode,
1467 there is no difference between CLASS and NCLASS. In all other wide
1468 character modes, set the 0xFF bit to indicate code units >= 255. */
1471 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1474 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1475 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1477 #elif PCRE2_CODE_UNIT_WIDTH != 8
1478 SET_BIT(0xFF); /* For characters >= 255 */
1482 /* Enter here for a positive non-XCLASS. If we have fallen through from
1483 an XCLASS, classmap will already be set; just advance the code pointer.
1484 Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1487 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1489 classmap = (uint8_t *)(++tcode);
1490 tcode += 32 / sizeof(PCRE2_UCHAR);
1493 /* When wide characters are supported, classmap may be NULL. In UTF-8
1494 (sic) mode, the bits in a class bit map correspond to character values,
1495 not to byte values. However, the bit map we are constructing is for byte
1496 values. So we have to do a conversion for characters whose code point is
1497 greater than 127. In fact, there are only two possible starting bytes for
1498 characters in the range 128 - 255. */
1500 if (classmap != NULL)
1502 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1505 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1506 for (c = 128; c < 256; c++)
1508 if ((classmap[c/8] & (1 << (c&7))) != 0)
1510 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1511 re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */
1512 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1518 /* In all modes except UTF-8, the two bit maps are compatible. */
1521 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1525 /* Act on what follows the class. For a zero minimum repeat, continue;
1526 otherwise stop processing. */
1542 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1543 else try_next = FALSE;
1550 break; /* End of class handling case */
1551 } /* End of switch for opcodes */
1552 } /* End of try_next loop */
1554 code += GET(code, 1); /* Advance to next branch */
1556 while (*code == OP_ALT);
1563 /*************************************************
1564 * Study a compiled expression *
1565 *************************************************/
1567 /* This function is handed a compiled expression that it must study to produce
1568 information that will speed up the matching.
1570 Argument: points to the compiled expression
1571 Returns: 0 normally; non-zero should never normally occur
1572 1 unknown opcode in set_start_bits
1573 2 missing capturing bracket
1574 3 unknown opcode in find_minlength
1578 PRIV(study)(pcre2_real_code *re)
1583 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1585 /* Find start of compiled code */
1587 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1588 re->name_entry_size * re->name_count;
1590 /* For a pattern that has a first code unit, or a multiline pattern that
1591 matches only at "line start", there is no point in seeking a list of starting
1594 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1596 int rc = set_start_bits(re, code, utf);
1597 if (rc == SSB_UNKNOWN) return 1;
1598 if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1601 /* Find the minimum length of subject string. If the pattern can match an empty
1602 string, the minimum length is already known. If there are more back references
1603 than the size of the vector we are going to cache them in, do nothing. A
1604 pattern that complicated will probably take a long time to analyze and may in
1605 any case turn out to be too complicated. Note that back reference minima are
1606 held as 16-bit numbers. */
1608 if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
1609 re->top_backref <= MAX_CACHE_BACKREF)
1611 int backref_cache[MAX_CACHE_BACKREF+1];
1612 backref_cache[0] = 0; /* Highest one that is set */
1613 min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1616 case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
1617 break; /* Leave minlength unchanged (will be zero) */
1620 return 2; /* missing capturing bracket */
1623 return 3; /* unrecognized opcode */
1626 if (min > UINT16_MAX) min = UINT16_MAX;
1627 re->minlength = min;
1635 /* End of pcre2_study.c */