1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2015-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
46 /* These defines enable debugging code */
48 /* #define DEBUG_FRAMES_DISPLAY */
49 /* #define DEBUG_SHOW_OPS */
50 /* #define DEBUG_SHOW_RMATCH */
/* The guard must use the same spelling as the DEBUG_FRAMES_DISPLAY switch
listed above, which is also the name tested before display_frames(); the
previous spelling (DEBUG_FRAME_DISPLAY) is not the documented switch, so this
section was skipped even when frame display was enabled. */
52 #ifdef DEBUG_FRAMES_DISPLAY
56 /* These defines identify the name of the block containing "static"
57 information, and fields within it. */
59 #define NLBLOCK mb /* Block containing newline information */
60 #define PSSTART start_subject /* Field containing processed string start */
61 #define PSEND end_subject /* Field containing processed string end */
63 #include "pcre2_internal.h"
65 #define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
67 /* Masks for identifying the public options that are permitted at match time. */
69 #define PUBLIC_MATCH_OPTIONS \
70 (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
71 PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
72 PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT)
74 #define PUBLIC_JIT_MATCH_OPTIONS \
75 (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
76 PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD)
78 /* Non-error returns from and within the match() function. Error returns are
79 externally defined PCRE2_ERROR_xxx codes, which are all negative. */
82 #define MATCH_NOMATCH 0
84 /* Special internal returns used in the match() function. Make them
85 sufficiently negative to avoid the external error codes. */
87 #define MATCH_ACCEPT (-999)
88 #define MATCH_KETRPOS (-998)
89 /* The next 5 must be kept together and in sequence so that a test that checks
90 for any one of them can use a range. */
91 #define MATCH_COMMIT (-997)
92 #define MATCH_PRUNE (-996)
93 #define MATCH_SKIP (-995)
94 #define MATCH_SKIP_ARG (-994)
95 #define MATCH_THEN (-993)
96 #define MATCH_BACKTRACK_MAX MATCH_THEN
97 #define MATCH_BACKTRACK_MIN MATCH_COMMIT
99 /* Group frame type values. Zero means the frame is not a group frame. The
100 lower 16 bits are used for data (e.g. the capture number). Group frames are
101 used for most groups so that information about the start is easily available at
102 the end without having to scan back through intermediate frames (backtrack
105 #define GF_CAPTURE 0x00010000u
106 #define GF_NOCAPTURE 0x00020000u
107 #define GF_CONDASSERT 0x00030000u
108 #define GF_RECURSE 0x00040000u
110 /* Masks for the identity and data parts of the group frame type. */
112 #define GF_IDMASK(a) ((a) & 0xffff0000u)
113 #define GF_DATAMASK(a) ((a) & 0x0000ffffu)
115 /* Repetition types */
117 enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS };
119 /* Min and max values for the common repeats; a maximum of UINT32_MAX =>
122 static const uint32_t rep_min[] = {
126 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */
127 0, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */
129 static const uint32_t rep_max[] = {
130 UINT32_MAX, UINT32_MAX, /* * and *? */
131 UINT32_MAX, UINT32_MAX, /* + and +? */
133 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */
134 UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */
136 /* Repetition types - must include OP_CRPOSRANGE (not needed above) */
138 static const uint32_t rep_typ[] = {
139 REPTYPE_MAX, REPTYPE_MIN, /* * and *? */
140 REPTYPE_MAX, REPTYPE_MIN, /* + and +? */
141 REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */
142 REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */
143 REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */
144 REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */
146 /* Numbers for RMATCH calls at backtracking points. When these lists are
147 changed, the code at RETURN_SWITCH below must be updated in sync. */
149 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
150 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
151 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
152 RM31, RM32, RM33, RM34, RM35, RM36 };
154 #ifdef SUPPORT_WIDE_CHARS
155 enum { RM100=100, RM101 };
158 #ifdef SUPPORT_UNICODE
159 enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
160 RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215,
161 RM216, RM217, RM218, RM219, RM220, RM221, RM222 };
164 /* Define short names for general fields in the current backtrack frame, which
165 is always pointed to by the F variable. Occasional references to fields in
166 other frames are written out explicitly. There are also some fields in the
167 current frame whose names start with "temp" that are used for short-term,
168 localised backtracking memory. These are #defined with Lxxx names at the point
169 of use and undefined afterwards. */
171 #define Fback_frame F->back_frame
172 #define Fcapture_last F->capture_last
173 #define Fcurrent_recurse F->current_recurse
174 #define Fecode F->ecode
175 #define Feptr F->eptr
176 #define Fgroup_frame_type F->group_frame_type
177 #define Flast_group_offset F->last_group_offset
178 #define Flength F->length
179 #define Fmark F->mark
180 #define Frdepth F->rdepth
181 #define Fstart_match F->start_match
182 #define Foffset_top F->offset_top
183 #define Foccu F->occu
185 #define Fovector F->ovector
186 #define Freturn_id F->return_id
189 #ifdef DEBUG_FRAMES_DISPLAY
190 /*************************************************
191 * Display current frames and contents *
192 *************************************************/
194 /* This debugging function displays the current set of frames and their
195 contents. It is not called automatically from anywhere, the intention being
196 that calls can be inserted where necessary when debugging frame-related
200 f the file to write to
201 F the current top frame
202 P a previous frame of interest
203 frame_size the frame size
204 mb points to the match block
205 s identification text
211 display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
212 match_block *mb, const char *s, ...)
219 fprintf(f, "FRAMES ");
223 if (P != NULL) fprintf(f, " P=%lu",
224 ((char *)P - (char *)(mb->match_frames))/frame_size);
227 for (i = 0, Q = mb->match_frames;
229 i++, Q = (heapframe *)((char *)Q + frame_size))
231 fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
232 i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),
233 Q->back_frame, Q->return_id);
235 if (Q->last_group_offset == PCRE2_UNSET)
236 fprintf(f, " lgoffset=unset\n");
238 fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);
246 /*************************************************
247 * Process a callout *
248 *************************************************/
250 /* This function is called for all callouts, whether "standalone" or at the
251 start of a conditional group. Fecode will be pointing to either OP_CALLOUT or
252 OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
256 F points to the current backtracking frame
257 mb points to the match block
258 lengthptr where to return the length of the callout item
260 Returns: the return from the callout
261 or 0 if no callout function exists
265 do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
268 PCRE2_SIZE save0, save1;
269 PCRE2_SIZE *callout_ovector;
270 pcre2_callout_block *cb;
272 *lengthptr = (*Fecode == OP_CALLOUT)?
273 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
275 if (mb->callout == NULL) return 0; /* No callout function provided */
277 /* The original matching code (pre 10.30) worked directly with the ovector
278 passed by the user, and this was passed to callouts. Now that the working
279 ovector is in the backtracking frame, it no longer needs to reserve space for
280 the overall match offsets (which would waste space in the frame). For backward
281 compatibility, however, we pass capture_top and offset_vector to the callout as
282 if for the extended ovector, and we ensure that the first two slots are unset
283 by preserving and restoring their current contents. Picky compilers complain if
284 references such as Fovector[-2] are used directly, so we set up a separate
287 callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
289 /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
290 are set externally. The first 3 never change; the last is updated for each
294 cb->capture_top = (uint32_t)Foffset_top/2 + 1;
295 cb->capture_last = Fcapture_last;
296 cb->offset_vector = callout_ovector;
297 cb->mark = mb->nomatch_mark;
298 cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
299 cb->pattern_position = GET(Fecode, 1);
300 cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
302 if (*Fecode == OP_CALLOUT) /* Numerical callout */
/* The callout number is the code unit that follows the two link-sized fields
read above; the string-related fields are cleared. */
304 cb->callout_number = Fecode[1 + 2*LINK_SIZE];
305 cb->callout_string_offset = 0;
306 cb->callout_string = NULL;
307 cb->callout_string_length = 0;
309 else /* String callout */
/* The string's offset is read from the field after three link-sized items. The
text itself starts one code unit beyond 1 + 4*LINK_SIZE, and its length is the
item length less that header and two further code units (presumably an opening
delimiter and a terminator - confirm against the code that compiles callouts). */
311 cb->callout_number = 0;
312 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
313 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
314 cb->callout_string_length =
315 *lengthptr - (1 + 4*LINK_SIZE) - 2;
/* Borrow the two slots just below the frame's ovector for the duration of the
callout: save them, present them to the callout as unset, then restore them. */
318 save0 = callout_ovector[0];
319 save1 = callout_ovector[1];
320 callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
321 rc = mb->callout(cb, mb->callout_data);
322 callout_ovector[0] = save0;
323 callout_ovector[1] = save1;
/* The flags are reset only after the callout has returned (presumably so that
flags set elsewhere apply to a single callout - verify against the setters). */
324 cb->callout_flags = 0;
330 /*************************************************
331 * Match a back-reference *
332 *************************************************/
334 /* This function is called only when it is known that the offset lies within
335 the offsets that have so far been used in the match. Note that in caseless
336 UTF-8 mode, the number of subject bytes matched may be different to the number
337 of reference bytes. (In theory this could also happen in UTF-16 mode, but it
341 offset index into the offset vector
342 caseless TRUE if caseless
343 F the current backtracking frame pointer
344 mb points to match block
345 lengthptr pointer for returning the length matched
347 Returns: = 0 successful match; number of code units matched is set
353 match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
354 PCRE2_SIZE *lengthptr)
359 PCRE2_SPTR eptr_start;
361 /* Deal with an unset group. The default is no match, but there is an option to
362 match an empty string. */
/* Return values used throughout this function: 0 for a match, -1 for no match,
and 1 when the subject runs out before the reference has been fully compared
(a partial match). */
364 if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
366 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
369 return 0; /* Match */
371 else return -1; /* No match */
374 /* Separate the caseless and UTF cases for speed. */
/* eptr walks the subject from the current position, with eptr_start remembering
where it began; p and length describe the previously captured text, taken from
the frame's ovector pair at offset and offset+1. */
376 eptr = eptr_start = Feptr;
377 p = mb->start_subject + Fovector[offset];
378 length = Fovector[offset+1] - Fovector[offset];
382 #if defined SUPPORT_UNICODE
383 if ((mb->poptions & PCRE2_UTF) != 0)
385 /* Match characters up to the end of the reference. NOTE: the number of
386 code units matched may differ, because in UTF-8 there are some characters
387 whose upper and lower case codes have different numbers of bytes. For
388 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
389 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
390 sequence of two of the latter. It is important, therefore, to check the
391 length along the reference, not along the subject (earlier code did this
394 PCRE2_SPTR endptr = p + length;
398 const ucd_record *ur;
399 if (eptr >= mb->end_subject) return 1; /* Partial match */
/* The characters differ and c is not d's single other-case partner (ur is
presumably d's property record - its lookup is not shown here), so fall back
to scanning the caseless set selected by ur->caseset. The scan gives up as
soon as c is below the current entry, which suggests the set is stored in
ascending order. */
403 if (c != d && c != (uint32_t)((int)d + ur->other_case))
405 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
408 if (c < *pp) return -1; /* No match */
409 if (c == *pp++) break;
417 /* Not in UTF mode */
/* One code unit at a time: both sides are passed through the mb->lcc table
before being compared. */
420 for (; length > 0; length--)
423 if (eptr >= mb->end_subject) return 1; /* Partial match */
424 cc = UCHAR21TEST(eptr);
426 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
427 return -1; /* No match */
434 /* In the caseful case, we can just compare the code units, whether or not we
435 are in UTF mode. When partial matching, we have to do this unit-by-unit. */
439 if (mb->partial != 0)
441 for (; length > 0; length--)
443 if (eptr >= mb->end_subject) return 1; /* Partial match */
444 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
448 /* Not partial matching */
/* With no partial matching, a single length check followed by one memcmp() of
the whole reference is enough. */
452 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
453 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
/* Report how far eptr advanced through the subject, in code units. */
458 *lengthptr = eptr - eptr_start;
459 return 0; /* Match */
464 /******************************************************************************
465 *******************************************************************************
466 "Recursion" in the match() function
468 The original match() function was highly recursive, but this proved to be the
469 source of a number of problems over the years, mostly because of the relatively
470 small system stacks that are commonly found. As new features were added to
471 patterns, various kludges were invented to reduce the amount of stack used,
472 making the code hard to understand in places.
474 A version did exist that used individual frames on the heap instead of calling
475 match() recursively, but this ran substantially slower. The current version is
476 a refactoring that uses a vector of frames to remember backtracking points.
477 This runs no slower, and possibly even a bit faster than the original recursive
478 implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
479 50 frames) is allocated on the system stack. If this is not big enough, the
480 heap is used for a larger vector.
482 *******************************************************************************
483 ******************************************************************************/
488 /*************************************************
489 * Macros for the match() function *
490 *************************************************/
492 /* These macros pack up tests that are used for partial matching several times
493 in the code. We set the "hit end" flag if the pointer is at the end of the
494 subject and also past the earliest inspected character (i.e. something has been
495 matched, even if not part of the actual matched string). For hard partial
496 matching, we then return immediately. The second one is used when we already
497 know we are past the end of the subject. */
499 #define CHECK_PARTIAL()\
500 if (mb->partial != 0 && Feptr >= mb->end_subject && \
501 Feptr > mb->start_used_ptr) \
504 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
507 #define SCHECK_PARTIAL()\
508 if (mb->partial != 0 && Feptr > mb->start_used_ptr) \
511 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
514 /* These macros are used to implement backtracking. They simulate a recursive
515 call to the match() function by means of a local vector of frames which
516 remember the backtracking points. */
518 #define RMATCH(ra,rb)\
534 /*************************************************
535 * Match from current position *
536 *************************************************/
538 /* This function is called to run one match attempt at a single starting point
541 Performance note: It might be tempting to extract commonly used fields from the
542 mb structure (e.g. end_subject) into individual variables to improve
543 performance. Tests using gcc on a SPARC disproved this; in the first case, it
544 made performance worse.
547 start_eptr starting character in subject
548 start_ecode starting position in compiled code
549 ovector pointer to the final output vector
550 oveccount number of pairs in ovector
551 top_bracket number of capturing parentheses in the pattern
552 frame_size size of each backtracking frame
553 mb pointer to "static" variables block
555 Returns: MATCH_MATCH if matched ) these values are >= 0
556 MATCH_NOMATCH if failed to match )
557 negative MATCH_xxx value for PRUNE, SKIP, etc
558 negative PCRE2_ERROR_xxx value if aborted by an error condition
559 (e.g. stopped by repeated call or depth limit)
563 match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
564 uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
567 /* Frame-handling variables */
569 heapframe *F; /* Current frame pointer */
570 heapframe *N = NULL; /* Temporary frame pointers */
572 heapframe *assert_accept_frame; /* For passing back the frame with captures */
573 PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
575 /* Local variables that do not need to be preserved over calls to RMATCH(). */
577 PCRE2_SPTR bracode; /* Temp pointer to start of group */
578 PCRE2_SIZE offset; /* Used for group offsets */
579 PCRE2_SIZE length; /* Used for various length calculations */
581 int rrc; /* Return from functions & backtracking "recursions" */
582 #ifdef SUPPORT_UNICODE
583 int proptype; /* Type of character property */
586 uint32_t i; /* Used for local loops */
587 uint32_t fc; /* Character values */
588 uint32_t number; /* Used for group and other numbers */
589 uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
590 uint32_t group_frame_type; /* Specifies type for new group frames */
592 BOOL condition; /* Used in conditional groups */
593 BOOL cur_is_word; /* Used in "word" tests */
594 BOOL prev_is_word; /* Used in "word" tests */
598 #ifdef SUPPORT_UNICODE
599 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
604 /* This is the length of the last part of a backtracking frame that must be
605 copied when a new frame is created. */
607 frame_copy_size = frame_size - offsetof(heapframe, eptr);
609 /* Set up the first current frame at the start of the vector, and initialize
610 fields that are not reset for new frames. */
612 F = mb->match_frames;
613 Frdepth = 0; /* "Recursion" depth */
614 Fcapture_last = 0; /* Number of most recent capture */
615 Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
616 Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
617 Fmark = NULL; /* Most recent mark */
618 Foffset_top = 0; /* End of captures within the frame */
619 Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
620 group_frame_type = 0; /* Not a start of group frame */
621 goto NEW_FRAME; /* Start processing with this frame */
623 /* Come back here when we want to create a new frame for remembering a
624 backtracking point. */
628 /* Set up a new backtracking frame. If the vector is full, get a new one
629 on the heap, doubling the size, but constrained by the heap limit. */
631 N = (heapframe *)((char *)F + frame_size);
632 if (N >= mb->match_frames_top)
634 PCRE2_SIZE newsize = mb->frame_vector_size * 2;
637 if ((newsize / 1024) > mb->heap_limit)
639 PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
640 if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
644 new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
645 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
646 memcpy(new, mb->match_frames, mb->frame_vector_size);
648 F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
649 N = (heapframe *)((char *)F + frame_size);
651 if (mb->match_frames != mb->stack_frames)
652 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
653 mb->match_frames = new;
654 mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
655 mb->frame_vector_size = newsize;
658 #ifdef DEBUG_SHOW_RMATCH
659 fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
660 if (group_frame_type != 0)
662 fprintf(stderr, " type=%x ", group_frame_type);
663 switch (GF_IDMASK(group_frame_type))
666 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
670 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
674 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
678 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
682 fprintf(stderr, "*** unknown ***");
686 fprintf(stderr, "\n");
689 /* Copy those fields that must be copied into the new frame, increase the
690 "recursion" depth (i.e. the new frame's index) and then make the new frame
693 memcpy((char *)N + offsetof(heapframe, eptr),
694 (char *)F + offsetof(heapframe, eptr),
697 N->rdepth = Frdepth + 1;
700 /* Carry on processing with a new frame. */
703 Fgroup_frame_type = group_frame_type;
704 Fecode = start_ecode; /* Starting code pointer */
705 Fback_frame = frame_size; /* Default is go back one frame */
707 /* If this is a special type of group frame, remember its offset for quick
708 access at the end of the group. If this is a recursion, set a new current
711 if (group_frame_type != 0)
713 Flast_group_offset = (char *)F - (char *)mb->match_frames;
714 if (GF_IDMASK(group_frame_type) == GF_RECURSE)
715 Fcurrent_recurse = GF_DATAMASK(group_frame_type);
716 group_frame_type = 0;
720 /* ========================================================================= */
721 /* This is the main processing loop. First check that we haven't recorded too
722 many backtracks (search tree is too large), or that we haven't exceeded the
723 recursive depth limit (used too many backtracking frames). If not, process the
726 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
727 if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
731 #ifdef DEBUG_SHOW_OPS
732 fprintf(stderr, "++ op=%d\n", *Fecode);
735 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
738 /* ===================================================================== */
739 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
740 any currently open capturing brackets. Unlike reaching the end of a group,
741 where we know the starting frame is at the top of the chained frames, in
742 this case we have to search back for the relevant frame in case other types
743 of group that use chained frames have intervened. Multiple OP_CLOSEs always
744 come innermost first, which matches the chain order. We can ignore this in
745 a recursion, because captures are not passed out of recursions. */
748 if (Fcurrent_recurse == RECURSE_UNSET)
750 number = GET2(Fecode, 1);
751 offset = Flast_group_offset;
754 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
755 N = (heapframe *)((char *)mb->match_frames + offset);
756 P = (heapframe *)((char *)N - frame_size);
757 if (N->group_frame_type == (GF_CAPTURE | number)) break;
758 offset = P->last_group_offset;
760 offset = (number << 1) - 2;
761 Fcapture_last = number;
762 Fovector[offset] = P->eptr - mb->start_subject;
763 Fovector[offset+1] = Feptr - mb->start_subject;
764 if (offset >= Foffset_top) Foffset_top = offset + 2;
766 Fecode += PRIV(OP_lengths)[*Fecode];
770 /* ===================================================================== */
771 /* Real or forced end of the pattern, assertion, or recursion. In an
772 assertion ACCEPT, update the last used pointer and remember the current
773 frame so that the captures and mark can be fished out of it. */
775 case OP_ASSERT_ACCEPT:
776 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
777 assert_accept_frame = F;
778 RRETURN(MATCH_ACCEPT);
780 /* If recursing, we have to find the most recent recursion. */
785 /* Handle end of a recursion. */
787 if (Fcurrent_recurse != RECURSE_UNSET)
789 offset = Flast_group_offset;
792 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
793 N = (heapframe *)((char *)mb->match_frames + offset);
794 P = (heapframe *)((char *)N - frame_size);
795 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
796 offset = P->last_group_offset;
799 /* N is now the frame of the recursion; the previous frame is at the
800 OP_RECURSE position. Go back there, copying the current subject position
801 and mark, and move on past the OP_RECURSE. */
806 Fecode += 1 + LINK_SIZE;
810 /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
811 is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
812 start of the subject. In both cases, backtracking will then try other
813 alternatives, if any. */
815 if (Feptr == Fstart_match &&
816 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
817 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
818 Fstart_match == mb->start_subject + mb->start_offset)))
819 RRETURN(MATCH_NOMATCH);
821 /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
822 the end of the subject. After (*ACCEPT) we fail the entire match (at this
823 position) but backtrack on reaching the end of the pattern. */
825 if (Feptr < mb->end_subject &&
826 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
828 if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
829 return MATCH_NOMATCH;
832 /* We have a successful match of the whole pattern. Record the result and
833 then do a direct return from the function. If there is space in the offset
834 vector, set any pairs that follow the highest-numbered captured string but
835 are less than the number of capturing groups in the pattern to PCRE2_UNSET.
836 It is documented that this happens. "Gaps" are set to PCRE2_UNSET
837 dynamically. It is only those at the end that need setting here. */
839 mb->end_match_ptr = Feptr; /* Record where we ended */
840 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
841 mb->mark = Fmark; /* and the last success mark */
842 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
844 ovector[0] = Fstart_match - mb->start_subject;
845 ovector[1] = Feptr - mb->start_subject;
847 /* Set i to the smaller of the sizes of the external and frame ovectors. */
849 i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
850 memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
851 while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
852 return MATCH_MATCH; /* Note: NOT RRETURN */
855 /*===================================================================== */
856 /* Match any single character type except newline; have to take care with
857 CRLF newlines and partial matching. */
860 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
861 if (mb->partial != 0 &&
862 Feptr == mb->end_subject - 1 &&
863 NLBLOCK->nltype == NLTYPE_FIXED &&
864 NLBLOCK->nllen == 2 &&
865 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
868 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
872 /* Match any single character whatsoever. */
875 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
876 { /* not be updated before SCHECK_PARTIAL. */
878 RRETURN(MATCH_NOMATCH);
881 #ifdef SUPPORT_UNICODE
882 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
888 /* ===================================================================== */
889 /* Match a single code unit, even in UTF mode. This opcode really does
890 match any code unit, even newline. (It really should be called ANYCODEUNIT,
891 of course - the byte name is from pre-16 bit days.) */
894 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
895 { /* not be updated before SCHECK_PARTIAL. */
897 RRETURN(MATCH_NOMATCH);
904 /* ===================================================================== */
905 /* Match a single character, casefully */
908 #ifdef SUPPORT_UNICODE
913 GETCHARLEN(fc, Fecode, Flength);
914 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
916 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
917 RRETURN(MATCH_NOMATCH);
919 for (; Flength > 0; Flength--)
921 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
928 if (mb->end_subject - Feptr < 1)
930 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
931 RRETURN(MATCH_NOMATCH);
933 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
939 /* ===================================================================== */
940 /* Match a single character, caselessly. If we are at the end of the
941 subject, give up immediately. We get here only when the pattern character
942 has at most one other case. Characters with more than two cases are coded
943 as OP_PROP with the pseudo-property PT_CLIST. */
946 if (Feptr >= mb->end_subject)
949 RRETURN(MATCH_NOMATCH);
952 #ifdef SUPPORT_UNICODE
957 GETCHARLEN(fc, Fecode, Flength);
959 /* If the pattern character's value is < 128, we know that its other case
960 (if any) is also < 128 (and therefore only one code unit long in all
961 code-unit widths), so we can use the fast lookup table. We checked above
962 that there is at least one character left in the subject. */
966 uint32_t cc = UCHAR21(Feptr);
967 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
972 /* Otherwise we must pick up the subject character and use Unicode
973 property support to test its other case. Note that we cannot use the
974 value of "Flength" to check for sufficient bytes left, because the other
975 case of the character may have more or fewer code units. */
980 GETCHARINC(dc, Feptr);
982 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
986 #endif /* SUPPORT_UNICODE */
988 /* Not UTF mode; use the table for characters < 256. */
990 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
991 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
998 /* ===================================================================== */
999 /* Match not a single character. */
1003 if (Feptr >= mb->end_subject)
1006 RRETURN(MATCH_NOMATCH);
1008 #ifdef SUPPORT_UNICODE
1013 GETCHARINC(ch, Fecode);
1014 GETCHARINC(fc, Feptr);
1017 RRETURN(MATCH_NOMATCH); /* Caseful match */
1019 else if (Fop == OP_NOTI) /* If caseless */
1022 ch = UCD_OTHERCASE(ch);
1024 ch = TABLE_GET(ch, mb->fcc, ch);
1025 if (ch == fc) RRETURN(MATCH_NOMATCH);
1029 #endif /* SUPPORT_UNICODE */
1031 uint32_t ch = Fecode[1];
1033 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1034 RRETURN(MATCH_NOMATCH);
1040 /* ===================================================================== */
1041 /* Match a single character repeatedly. */
1043 #define Loclength F->temp_size
1044 #define Lstart_eptr F->temp_sptr[0]
1045 #define Lcharptr F->temp_sptr[1]
1046 #define Lmin F->temp_32[0]
1047 #define Lmax F->temp_32[1]
1048 #define Lc F->temp_32[2]
1049 #define Loc F->temp_32[3]
1053 Lmin = Lmax = GET2(Fecode, 1);
1054 Fecode += 1 + IMM2_SIZE;
1059 reptype = REPTYPE_POS;
1061 Lmax = GET2(Fecode, 1);
1062 Fecode += 1 + IMM2_SIZE;
1067 reptype = REPTYPE_MAX;
1069 Lmax = GET2(Fecode, 1);
1070 Fecode += 1 + IMM2_SIZE;
1075 reptype = REPTYPE_MIN;
1077 Lmax = GET2(Fecode, 1);
1078 Fecode += 1 + IMM2_SIZE;
1083 reptype = REPTYPE_POS;
1091 reptype = REPTYPE_POS;
1099 reptype = REPTYPE_POS;
1117 fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1120 reptype = rep_typ[fc];
1122 /* Common code for all repeated single-character matches. We first check
1123 for the minimum number of characters. If the minimum equals the maximum, we
1124 are done. Otherwise, if minimizing, check the rest of the pattern for a
1125 match; if there isn't one, advance up to the maximum, one character at a
1128 If maximizing, advance up to the maximum number of matching characters,
1129 until Feptr is past the end of the maximum run. If possessive, we are
1130 then done (no backing up). Otherwise, match at this position; anything
1131 other than no match is immediately returned. For nomatch, back up one
1132 character, unless we are matching \R and the last thing matched was
1133 \r\n, in which case, back up two code units until we reach the first
1134 optional character position.
1136 The various UTF/non-UTF and caseful/caseless cases are handled separately,
1140 #ifdef SUPPORT_UNICODE
1145 GETCHARLEN(fc, Fecode, Flength);
1148 /* Handle multi-code-unit character matching, caseful and caseless. */
1154 if (Fop >= OP_STARI && /* Caseless */
1155 (othercase = UCD_OTHERCASE(fc)) != fc)
1156 Loclength = PRIV(ord2utf)(othercase, Foccu);
1159 for (i = 1; i <= Lmin; i++)
1161 if (Feptr <= mb->end_subject - Flength &&
1162 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1163 else if (Loclength > 0 &&
1164 Feptr <= mb->end_subject - Loclength &&
1165 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1170 RRETURN(MATCH_NOMATCH);
1174 if (Lmin == Lmax) continue;
1176 if (reptype == REPTYPE_MIN)
1180 RMATCH(Fecode, RM202);
1181 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1182 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1183 if (Feptr <= mb->end_subject - Flength &&
1184 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1185 else if (Loclength > 0 &&
1186 Feptr <= mb->end_subject - Loclength &&
1187 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1192 RRETURN(MATCH_NOMATCH);
1195 /* Control never gets here */
1200 Lstart_eptr = Feptr;
1201 for (i = Lmin; i < Lmax; i++)
1203 if (Feptr <= mb->end_subject - Flength &&
1204 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
1206 else if (Loclength > 0 &&
1207 Feptr <= mb->end_subject - Loclength &&
1208 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1217 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1218 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1221 if (reptype != REPTYPE_POS) for(;;)
1223 if (Feptr <= Lstart_eptr) break;
1224 RMATCH(Fecode, RM203);
1225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1230 break; /* End of repeated wide character handling */
1233 /* Length of UTF character is 1. Put it into the preserved variable and
1234 fall through to the non-UTF code. */
1239 #endif /* SUPPORT_UNICODE */
1241 /* When not in UTF mode, load a single-code-unit character. Then proceed as
1246 /* Caseless comparison */
1248 if (Fop >= OP_STARI)
1250 #if PCRE2_CODE_UNIT_WIDTH == 8
1251 /* Lc must be < 128 in UTF-8 mode. */
1253 #else /* 16-bit & 32-bit */
1254 #ifdef SUPPORT_UNICODE
1255 if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1257 #endif /* SUPPORT_UNICODE */
1258 Loc = TABLE_GET(Lc, mb->fcc, Lc);
1259 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1261 for (i = 1; i <= Lmin; i++)
1263 uint32_t cc; /* Faster than PCRE2_UCHAR */
1264 if (Feptr >= mb->end_subject)
1267 RRETURN(MATCH_NOMATCH);
1269 cc = UCHAR21TEST(Feptr);
1270 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1273 if (Lmin == Lmax) continue;
1275 if (reptype == REPTYPE_MIN)
1279 uint32_t cc; /* Faster than PCRE2_UCHAR */
1280 RMATCH(Fecode, RM25);
1281 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1282 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1283 if (Feptr >= mb->end_subject)
1286 RRETURN(MATCH_NOMATCH);
1288 cc = UCHAR21TEST(Feptr);
1289 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1292 /* Control never gets here */
1297 Lstart_eptr = Feptr;
1298 for (i = Lmin; i < Lmax; i++)
1300 uint32_t cc; /* Faster than PCRE2_UCHAR */
1301 if (Feptr >= mb->end_subject)
1306 cc = UCHAR21TEST(Feptr);
1307 if (Lc != cc && Loc != cc) break;
1310 if (reptype != REPTYPE_POS) for (;;)
1312 if (Feptr == Lstart_eptr) break;
1313 RMATCH(Fecode, RM26);
1315 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1320 /* Caseful comparisons (includes all multi-byte characters) */
1324 for (i = 1; i <= Lmin; i++)
1326 if (Feptr >= mb->end_subject)
1329 RRETURN(MATCH_NOMATCH);
1331 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1334 if (Lmin == Lmax) continue;
1336 if (reptype == REPTYPE_MIN)
1340 RMATCH(Fecode, RM27);
1341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1342 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1343 if (Feptr >= mb->end_subject)
1346 RRETURN(MATCH_NOMATCH);
1348 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1350 /* Control never gets here */
1354 Lstart_eptr = Feptr;
1355 for (i = Lmin; i < Lmax; i++)
1357 if (Feptr >= mb->end_subject)
1363 if (Lc != UCHAR21TEST(Feptr)) break;
1367 if (reptype != REPTYPE_POS) for (;;)
1369 if (Feptr <= Lstart_eptr) break;
1370 RMATCH(Fecode, RM28);
1372 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1387 /* ===================================================================== */
1388 /* Match a negated single one-byte character repeatedly. This is almost a
1389 repeat of the code for a repeated single character, but I haven't found a
1390 nice way of commoning these up that doesn't require a test of the
1391 positive/negative option for each character match. Maybe that wouldn't add
1392 very much to the time taken, but character matching *is* what this is all
1395 #define Lstart_eptr F->temp_sptr[0]
1396 #define Lmin F->temp_32[0]
1397 #define Lmax F->temp_32[1]
1398 #define Lc F->temp_32[2]
1399 #define Loc F->temp_32[3]
1403 Lmin = Lmax = GET2(Fecode, 1);
1404 Fecode += 1 + IMM2_SIZE;
1410 Lmax = GET2(Fecode, 1);
1411 reptype = REPTYPE_MAX;
1412 Fecode += 1 + IMM2_SIZE;
1416 case OP_NOTMINUPTOI:
1418 Lmax = GET2(Fecode, 1);
1419 reptype = REPTYPE_MIN;
1420 Fecode += 1 + IMM2_SIZE;
1424 case OP_NOTPOSSTARI:
1425 reptype = REPTYPE_POS;
1432 case OP_NOTPOSPLUSI:
1433 reptype = REPTYPE_POS;
1439 case OP_NOTPOSQUERY:
1440 case OP_NOTPOSQUERYI:
1441 reptype = REPTYPE_POS;
1448 case OP_NOTPOSUPTOI:
1449 reptype = REPTYPE_POS;
1451 Lmax = GET2(Fecode, 1);
1452 Fecode += 1 + IMM2_SIZE;
1458 case OP_NOTMINSTARI:
1462 case OP_NOTMINPLUSI:
1465 case OP_NOTMINQUERY:
1466 case OP_NOTMINQUERYI:
1467 fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1470 reptype = rep_typ[fc];
1472 /* Common code for all repeated single-character non-matches. */
1475 GETCHARINCTEST(Lc, Fecode);
1477 /* The code is duplicated for the caseless and caseful cases, for speed,
1478 since matching characters is likely to be quite common. First, ensure the
1479 minimum number of matches are present. If Lmin = Lmax, we are done.
1480 Otherwise, if minimizing, keep trying the rest of the expression and
1481 advancing one matching character if failing, up to the maximum.
1482 Alternatively, if maximizing, find the maximum number of characters and
1485 if (Fop >= OP_NOTSTARI) /* Caseless */
1487 #ifdef SUPPORT_UNICODE
1488 if (utf && Lc > 127)
1489 Loc = UCD_OTHERCASE(Lc);
1491 #endif /* SUPPORT_UNICODE */
1493 Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
1495 #ifdef SUPPORT_UNICODE
1499 for (i = 1; i <= Lmin; i++)
1501 if (Feptr >= mb->end_subject)
1504 RRETURN(MATCH_NOMATCH);
1506 GETCHARINC(d, Feptr);
1507 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1511 #endif /* SUPPORT_UNICODE */
1515 for (i = 1; i <= Lmin; i++)
1517 if (Feptr >= mb->end_subject)
1520 RRETURN(MATCH_NOMATCH);
1522 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1527 if (Lmin == Lmax) continue; /* Finished for exact count */
1529 if (reptype == REPTYPE_MIN)
1531 #ifdef SUPPORT_UNICODE
1537 RMATCH(Fecode, RM204);
1538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1539 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1540 if (Feptr >= mb->end_subject)
1543 RRETURN(MATCH_NOMATCH);
1545 GETCHARINC(d, Feptr);
1546 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1550 #endif /*SUPPORT_UNICODE */
1556 RMATCH(Fecode, RM29);
1557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1558 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1559 if (Feptr >= mb->end_subject)
1562 RRETURN(MATCH_NOMATCH);
1564 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1568 /* Control never gets here */
1575 Lstart_eptr = Feptr;
1577 #ifdef SUPPORT_UNICODE
1581 for (i = Lmin; i < Lmax; i++)
1584 if (Feptr >= mb->end_subject)
1589 GETCHARLEN(d, Feptr, len);
1590 if (Lc == d || Loc == d) break;
1594 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1595 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1598 if (reptype != REPTYPE_POS) for(;;)
1600 if (Feptr <= Lstart_eptr) break;
1601 RMATCH(Fecode, RM205);
1602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1608 #endif /* SUPPORT_UNICODE */
1612 for (i = Lmin; i < Lmax; i++)
1614 if (Feptr >= mb->end_subject)
1619 if (Lc == *Feptr || Loc == *Feptr) break;
1622 if (reptype != REPTYPE_POS) for (;;)
1624 if (Feptr == Lstart_eptr) break;
1625 RMATCH(Fecode, RM30);
1626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1633 /* Caseful comparisons */
1637 #ifdef SUPPORT_UNICODE
1641 for (i = 1; i <= Lmin; i++)
1643 if (Feptr >= mb->end_subject)
1646 RRETURN(MATCH_NOMATCH);
1648 GETCHARINC(d, Feptr);
1649 if (Lc == d) RRETURN(MATCH_NOMATCH);
1656 for (i = 1; i <= Lmin; i++)
1658 if (Feptr >= mb->end_subject)
1661 RRETURN(MATCH_NOMATCH);
1663 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1667 if (Lmin == Lmax) continue;
1669 if (reptype == REPTYPE_MIN)
1671 #ifdef SUPPORT_UNICODE
1677 RMATCH(Fecode, RM206);
1678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1679 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1680 if (Feptr >= mb->end_subject)
1683 RRETURN(MATCH_NOMATCH);
1685 GETCHARINC(d, Feptr);
1686 if (Lc == d) RRETURN(MATCH_NOMATCH);
1695 RMATCH(Fecode, RM31);
1696 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1697 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1698 if (Feptr >= mb->end_subject)
1701 RRETURN(MATCH_NOMATCH);
1703 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1706 /* Control never gets here */
1713 Lstart_eptr = Feptr;
1715 #ifdef SUPPORT_UNICODE
1719 for (i = Lmin; i < Lmax; i++)
1722 if (Feptr >= mb->end_subject)
1727 GETCHARLEN(d, Feptr, len);
1732 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1733 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1736 if (reptype != REPTYPE_POS) for(;;)
1738 if (Feptr <= Lstart_eptr) break;
1739 RMATCH(Fecode, RM207);
1740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1749 for (i = Lmin; i < Lmax; i++)
1751 if (Feptr >= mb->end_subject)
1756 if (Lc == *Feptr) break;
1759 if (reptype != REPTYPE_POS) for (;;)
1761 if (Feptr == Lstart_eptr) break;
1762 RMATCH(Fecode, RM32);
1763 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1778 /* ===================================================================== */
1779 /* Match a bit-mapped character class, possibly repeatedly. These opcodes
1780 are used when all the characters in the class have values in the range
1781 0-255, and either the matching is caseful, or the characters are in the
1782 range 0-127 when UTF processing is enabled. The only difference between
1783 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1786 #define Lmin F->temp_32[0]
1787 #define Lmax F->temp_32[1]
1788 #define Lstart_eptr F->temp_sptr[0]
1789 #define Lbyte_map_address F->temp_sptr[1]
1790 #define Lbyte_map ((unsigned char *)Lbyte_map_address)
1795 Lbyte_map_address = Fecode + 1; /* Save for matching */
1796 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1798 /* Look past the end of the item to see if there is repeat information
1799 following. Then obey similar code to character type repeats. */
1812 fc = *Fecode++ - OP_CRSTAR;
1815 reptype = rep_typ[fc];
1821 Lmin = GET2(Fecode, 1);
1822 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1823 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1824 reptype = rep_typ[*Fecode - OP_CRSTAR];
1825 Fecode += 1 + 2 * IMM2_SIZE;
1828 default: /* No repeat follows */
1833 /* First, ensure the minimum number of matches are present. */
1835 #ifdef SUPPORT_UNICODE
1838 for (i = 1; i <= Lmin; i++)
1840 if (Feptr >= mb->end_subject)
1843 RRETURN(MATCH_NOMATCH);
1845 GETCHARINC(fc, Feptr);
1848 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1851 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1858 for (i = 1; i <= Lmin; i++)
1860 if (Feptr >= mb->end_subject)
1863 RRETURN(MATCH_NOMATCH);
1866 #if PCRE2_CODE_UNIT_WIDTH != 8
1869 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1873 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1877 /* If Lmax == Lmin we are done. Continue with main loop. */
1879 if (Lmin == Lmax) continue;
1881 /* If minimizing, keep testing the rest of the expression and advancing
1882 the pointer while it matches the class. */
1884 if (reptype == REPTYPE_MIN)
1886 #ifdef SUPPORT_UNICODE
1891 RMATCH(Fecode, RM200);
1892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1893 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1894 if (Feptr >= mb->end_subject)
1897 RRETURN(MATCH_NOMATCH);
1899 GETCHARINC(fc, Feptr);
1902 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1905 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1914 RMATCH(Fecode, RM23);
1915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1916 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1917 if (Feptr >= mb->end_subject)
1920 RRETURN(MATCH_NOMATCH);
1923 #if PCRE2_CODE_UNIT_WIDTH != 8
1926 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1930 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1933 /* Control never gets here */
1936 /* If maximizing, find the longest possible run, then work backwards. */
1940 Lstart_eptr = Feptr;
1942 #ifdef SUPPORT_UNICODE
1945 for (i = Lmin; i < Lmax; i++)
1948 if (Feptr >= mb->end_subject)
1953 GETCHARLEN(fc, Feptr, len);
1956 if (Fop == OP_CLASS) break;
1959 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break;
1963 if (reptype == REPTYPE_POS) continue; /* No backtracking */
1965 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1966 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1971 RMATCH(Fecode, RM201);
1972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1973 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
1981 for (i = Lmin; i < Lmax; i++)
1983 if (Feptr >= mb->end_subject)
1989 #if PCRE2_CODE_UNIT_WIDTH != 8
1992 if (Fop == OP_CLASS) break;
1996 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break;
2000 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2002 while (Feptr >= Lstart_eptr)
2004 RMATCH(Fecode, RM24);
2005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2010 RRETURN(MATCH_NOMATCH);
2013 /* Control never gets here */
2015 #undef Lbyte_map_address
2022 /* ===================================================================== */
2023 /* Match an extended character class. In the 8-bit library, this opcode is
2024 encountered only when UTF-8 mode is supported. In the 16-bit and
2025 32-bit libraries, codepoints greater than 255 may be encountered even when
2026 UTF is not supported. */
2028 #define Lstart_eptr F->temp_sptr[0]
2029 #define Lxclass_data F->temp_sptr[1]
2030 #define Lmin F->temp_32[0]
2031 #define Lmax F->temp_32[1]
2033 #ifdef SUPPORT_WIDE_CHARS
2036 Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2037 Fecode += GET(Fecode, 1); /* Advance past the item */
2050 fc = *Fecode++ - OP_CRSTAR;
2053 reptype = rep_typ[fc];
2059 Lmin = GET2(Fecode, 1);
2060 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2061 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2062 reptype = rep_typ[*Fecode - OP_CRSTAR];
2063 Fecode += 1 + 2 * IMM2_SIZE;
2066 default: /* No repeat follows */
2071 /* First, ensure the minimum number of matches are present. */
2073 for (i = 1; i <= Lmin; i++)
2075 if (Feptr >= mb->end_subject)
2078 RRETURN(MATCH_NOMATCH);
2080 GETCHARINCTEST(fc, Feptr);
2081 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2084 /* If Lmax == Lmin we can just continue with the main loop. */
2086 if (Lmin == Lmax) continue;
2088 /* If minimizing, keep testing the rest of the expression and advancing
2089 the pointer while it matches the class. */
2091 if (reptype == REPTYPE_MIN)
2095 RMATCH(Fecode, RM100);
2096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2097 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2098 if (Feptr >= mb->end_subject)
2101 RRETURN(MATCH_NOMATCH);
2103 GETCHARINCTEST(fc, Feptr);
2104 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2106 /* Control never gets here */
2109 /* If maximizing, find the longest possible run, then work backwards. */
2113 Lstart_eptr = Feptr;
2114 for (i = Lmin; i < Lmax; i++)
2117 if (Feptr >= mb->end_subject)
2122 #ifdef SUPPORT_UNICODE
2123 GETCHARLENTEST(fc, Feptr, len);
2127 if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
2131 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2133 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2134 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2139 RMATCH(Fecode, RM101);
2140 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2141 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2142 #ifdef SUPPORT_UNICODE
2143 if (utf) BACKCHAR(Feptr);
2146 RRETURN(MATCH_NOMATCH);
2149 /* Control never gets here */
2151 #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */
2159 /* ===================================================================== */
2160 /* Match various character types when PCRE2_UCP is not set. These opcodes
2161 are not generated when PCRE2_UCP is set - instead appropriate property
2162 tests are compiled. */
2165 if (Feptr >= mb->end_subject)
2168 RRETURN(MATCH_NOMATCH);
2170 GETCHARINCTEST(fc, Feptr);
2171 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2172 RRETURN(MATCH_NOMATCH);
2177 if (Feptr >= mb->end_subject)
2180 RRETURN(MATCH_NOMATCH);
2182 GETCHARINCTEST(fc, Feptr);
2183 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2184 RRETURN(MATCH_NOMATCH);
2188 case OP_NOT_WHITESPACE:
2189 if (Feptr >= mb->end_subject)
2192 RRETURN(MATCH_NOMATCH);
2194 GETCHARINCTEST(fc, Feptr);
2195 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2196 RRETURN(MATCH_NOMATCH);
2201 if (Feptr >= mb->end_subject)
2204 RRETURN(MATCH_NOMATCH);
2206 GETCHARINCTEST(fc, Feptr);
2207 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2208 RRETURN(MATCH_NOMATCH);
2212 case OP_NOT_WORDCHAR:
2213 if (Feptr >= mb->end_subject)
2216 RRETURN(MATCH_NOMATCH);
2218 GETCHARINCTEST(fc, Feptr);
2219 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2220 RRETURN(MATCH_NOMATCH);
2225 if (Feptr >= mb->end_subject)
2228 RRETURN(MATCH_NOMATCH);
2230 GETCHARINCTEST(fc, Feptr);
2231 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2232 RRETURN(MATCH_NOMATCH);
2237 if (Feptr >= mb->end_subject)
2240 RRETURN(MATCH_NOMATCH);
2242 GETCHARINCTEST(fc, Feptr);
2245 default: RRETURN(MATCH_NOMATCH);
2248 if (Feptr >= mb->end_subject)
2252 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2264 #endif /* Not EBCDIC */
2265 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2272 if (Feptr >= mb->end_subject)
2275 RRETURN(MATCH_NOMATCH);
2277 GETCHARINCTEST(fc, Feptr);
2280 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2287 if (Feptr >= mb->end_subject)
2290 RRETURN(MATCH_NOMATCH);
2292 GETCHARINCTEST(fc, Feptr);
2295 HSPACE_CASES: break; /* Byte and multibyte cases */
2296 default: RRETURN(MATCH_NOMATCH);
2302 if (Feptr >= mb->end_subject)
2305 RRETURN(MATCH_NOMATCH);
2307 GETCHARINCTEST(fc, Feptr);
2310 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2317 if (Feptr >= mb->end_subject)
2320 RRETURN(MATCH_NOMATCH);
2322 GETCHARINCTEST(fc, Feptr);
2325 VSPACE_CASES: break;
2326 default: RRETURN(MATCH_NOMATCH);
2332 #ifdef SUPPORT_UNICODE
2334 /* ===================================================================== */
2335 /* Check the next character by Unicode property. We will get here only
2336 if the support is in the binary; otherwise a compile-time error occurs. */
2340 if (Feptr >= mb->end_subject)
2343 RRETURN(MATCH_NOMATCH);
2345 GETCHARINCTEST(fc, Feptr);
2348 const ucd_record *prop = GET_UCD(fc);
2353 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2357 if ((prop->chartype == ucp_Lu ||
2358 prop->chartype == ucp_Ll ||
2359 prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP))
2360 RRETURN(MATCH_NOMATCH);
2364 if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP))
2365 RRETURN(MATCH_NOMATCH);
2369 if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP))
2370 RRETURN(MATCH_NOMATCH);
2374 if ((Fecode[2] != prop->script) == (Fop == OP_PROP))
2375 RRETURN(MATCH_NOMATCH);
2378 /* These are specials */
2381 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2382 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP))
2383 RRETURN(MATCH_NOMATCH);
2386 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2387 which means that Perl space and POSIX space are now identical. PCRE
2388 was changed at release 8.34. */
2390 case PT_SPACE: /* Perl space */
2391 case PT_PXSPACE: /* POSIX space */
2396 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2400 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2401 (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2407 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2408 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2409 fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP))
2410 RRETURN(MATCH_NOMATCH);
2414 cp = PRIV(ucd_caseless_sets) + Fecode[2];
2418 { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2420 { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2425 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2426 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2427 fc >= 0xe000) == (Fop == OP_NOTPROP))
2428 RRETURN(MATCH_NOMATCH);
2431 /* This should never occur */
2434 return PCRE2_ERROR_INTERNAL;
2442 /* ===================================================================== */
2443 /* Match an extended Unicode sequence. We will get here only if the support
2444 is in the binary; otherwise a compile-time error occurs. */
2447 if (Feptr >= mb->end_subject)
2450 RRETURN(MATCH_NOMATCH);
2454 GETCHARINCTEST(fc, Feptr);
2455 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2462 #endif /* SUPPORT_UNICODE */
2465 /* ===================================================================== */
2466 /* Match a single character type repeatedly. Note that the property type
2467 does not need to be in a stack frame as it is not used within an RMATCH()
2470 #define Lstart_eptr F->temp_sptr[0]
2471 #define Lmin F->temp_32[0]
2472 #define Lmax F->temp_32[1]
2473 #define Lctype F->temp_32[2]
2474 #define Lpropvalue F->temp_32[3]
2477 Lmin = Lmax = GET2(Fecode, 1);
2478 Fecode += 1 + IMM2_SIZE;
2482 case OP_TYPEMINUPTO:
2484 Lmax = GET2(Fecode, 1);
2485 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2486 Fecode += 1 + IMM2_SIZE;
2489 case OP_TYPEPOSSTAR:
2490 reptype = REPTYPE_POS;
2496 case OP_TYPEPOSPLUS:
2497 reptype = REPTYPE_POS;
2503 case OP_TYPEPOSQUERY:
2504 reptype = REPTYPE_POS;
2510 case OP_TYPEPOSUPTO:
2511 reptype = REPTYPE_POS;
2513 Lmax = GET2(Fecode, 1);
2514 Fecode += 1 + IMM2_SIZE;
2518 case OP_TYPEMINSTAR:
2520 case OP_TYPEMINPLUS:
2522 case OP_TYPEMINQUERY:
2523 fc = *Fecode++ - OP_TYPESTAR;
2526 reptype = rep_typ[fc];
2528 /* Common code for all repeated character type matches. */
2531 Lctype = *Fecode++; /* Code for the character type */
2533 #ifdef SUPPORT_UNICODE
2534 if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2536 proptype = *Fecode++;
2537 Lpropvalue = *Fecode++;
2542 /* First, ensure the minimum number of matches are present. Use inline
2543 code for maximizing the speed, and do the type test once at the start
2544 (i.e. keep it out of the loop). The code for UTF mode is separated out for
2545 tidiness, except for Unicode property tests. */
2549 #ifdef SUPPORT_UNICODE
2550 if (proptype >= 0) /* Property tests in all modes */
2555 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2556 for (i = 1; i <= Lmin; i++)
2558 if (Feptr >= mb->end_subject)
2561 RRETURN(MATCH_NOMATCH);
2563 GETCHARINCTEST(fc, Feptr);
2568 for (i = 1; i <= Lmin; i++)
2571 if (Feptr >= mb->end_subject)
2574 RRETURN(MATCH_NOMATCH);
2576 GETCHARINCTEST(fc, Feptr);
2577 chartype = UCD_CHARTYPE(fc);
2578 if ((chartype == ucp_Lu ||
2579 chartype == ucp_Ll ||
2580 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
2581 RRETURN(MATCH_NOMATCH);
2586 for (i = 1; i <= Lmin; i++)
2588 if (Feptr >= mb->end_subject)
2591 RRETURN(MATCH_NOMATCH);
2593 GETCHARINCTEST(fc, Feptr);
2594 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2595 RRETURN(MATCH_NOMATCH);
2600 for (i = 1; i <= Lmin; i++)
2602 if (Feptr >= mb->end_subject)
2605 RRETURN(MATCH_NOMATCH);
2607 GETCHARINCTEST(fc, Feptr);
2608 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2609 RRETURN(MATCH_NOMATCH);
2614 for (i = 1; i <= Lmin; i++)
2616 if (Feptr >= mb->end_subject)
2619 RRETURN(MATCH_NOMATCH);
2621 GETCHARINCTEST(fc, Feptr);
2622 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2623 RRETURN(MATCH_NOMATCH);
2628 for (i = 1; i <= Lmin; i++)
2631 if (Feptr >= mb->end_subject)
2634 RRETURN(MATCH_NOMATCH);
2636 GETCHARINCTEST(fc, Feptr);
2637 category = UCD_CATEGORY(fc);
2638 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
2639 RRETURN(MATCH_NOMATCH);
2643 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2644 which means that Perl space and POSIX space are now identical. PCRE
2645 was changed at release 8.34. */
2647 case PT_SPACE: /* Perl space */
2648 case PT_PXSPACE: /* POSIX space */
2649 for (i = 1; i <= Lmin; i++)
2651 if (Feptr >= mb->end_subject)
2654 RRETURN(MATCH_NOMATCH);
2656 GETCHARINCTEST(fc, Feptr);
2661 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2665 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
2666 RRETURN(MATCH_NOMATCH);
2673 for (i = 1; i <= Lmin; i++)
2676 if (Feptr >= mb->end_subject)
2679 RRETURN(MATCH_NOMATCH);
2681 GETCHARINCTEST(fc, Feptr);
2682 category = UCD_CATEGORY(fc);
2683 if ((category == ucp_L || category == ucp_N ||
2684 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
2685 RRETURN(MATCH_NOMATCH);
2690 for (i = 1; i <= Lmin; i++)
2693 if (Feptr >= mb->end_subject)
2696 RRETURN(MATCH_NOMATCH);
2698 GETCHARINCTEST(fc, Feptr);
2699 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
2704 if (Lctype == OP_NOTPROP) break;
2705 RRETURN(MATCH_NOMATCH);
2709 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2717 for (i = 1; i <= Lmin; i++)
2719 if (Feptr >= mb->end_subject)
2722 RRETURN(MATCH_NOMATCH);
2724 GETCHARINCTEST(fc, Feptr);
2725 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2726 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2727 fc >= 0xe000) == (Lctype == OP_NOTPROP))
2728 RRETURN(MATCH_NOMATCH);
2732 /* This should not occur */
2735 return PCRE2_ERROR_INTERNAL;
2739 /* Match extended Unicode sequences. We will get here only if the
2740 support is in the binary; otherwise a compile-time error occurs. */
2742 else if (Lctype == OP_EXTUNI)
2744 for (i = 1; i <= Lmin; i++)
2746 if (Feptr >= mb->end_subject)
2749 RRETURN(MATCH_NOMATCH);
2753 GETCHARINCTEST(fc, Feptr);
2754 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
2755 mb->end_subject, utf, NULL);
2761 #endif /* SUPPORT_UNICODE */
2763 /* Handle all other cases in UTF mode */
2765 #ifdef SUPPORT_UNICODE
2766 if (utf) switch(Lctype)
2769 for (i = 1; i <= Lmin; i++)
2771 if (Feptr >= mb->end_subject)
2774 RRETURN(MATCH_NOMATCH);
2776 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
2777 if (mb->partial != 0 &&
2778 Feptr + 1 >= mb->end_subject &&
2779 NLBLOCK->nltype == NLTYPE_FIXED &&
2780 NLBLOCK->nllen == 2 &&
2781 UCHAR21(Feptr) == NLBLOCK->nl[0])
2784 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
2787 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2792 for (i = 1; i <= Lmin; i++)
2794 if (Feptr >= mb->end_subject)
2797 RRETURN(MATCH_NOMATCH);
2800 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2805 if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
2810 for (i = 1; i <= Lmin; i++)
2812 if (Feptr >= mb->end_subject)
2815 RRETURN(MATCH_NOMATCH);
2817 GETCHARINC(fc, Feptr);
2820 default: RRETURN(MATCH_NOMATCH);
2823 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
2835 #endif /* Not EBCDIC */
2836 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2843 for (i = 1; i <= Lmin; i++)
2845 if (Feptr >= mb->end_subject)
2848 RRETURN(MATCH_NOMATCH);
2850 GETCHARINC(fc, Feptr);
2853 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
2860 for (i = 1; i <= Lmin; i++)
2862 if (Feptr >= mb->end_subject)
2865 RRETURN(MATCH_NOMATCH);
2867 GETCHARINC(fc, Feptr);
2870 HSPACE_CASES: break;
2871 default: RRETURN(MATCH_NOMATCH);
2877 for (i = 1; i <= Lmin; i++)
2879 if (Feptr >= mb->end_subject)
2882 RRETURN(MATCH_NOMATCH);
2884 GETCHARINC(fc, Feptr);
2887 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2894 for (i = 1; i <= Lmin; i++)
2896 if (Feptr >= mb->end_subject)
2899 RRETURN(MATCH_NOMATCH);
2901 GETCHARINC(fc, Feptr);
2904 VSPACE_CASES: break;
2905 default: RRETURN(MATCH_NOMATCH);
2911 for (i = 1; i <= Lmin; i++)
2913 if (Feptr >= mb->end_subject)
2916 RRETURN(MATCH_NOMATCH);
2918 GETCHARINC(fc, Feptr);
2919 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
2920 RRETURN(MATCH_NOMATCH);
2925 for (i = 1; i <= Lmin; i++)
2928 if (Feptr >= mb->end_subject)
2931 RRETURN(MATCH_NOMATCH);
2933 cc = UCHAR21(Feptr);
2934 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
2935 RRETURN(MATCH_NOMATCH);
2937 /* No need to skip more code units - we know it has only one. */
2941 case OP_NOT_WHITESPACE:
2942 for (i = 1; i <= Lmin; i++)
2945 if (Feptr >= mb->end_subject)
2948 RRETURN(MATCH_NOMATCH);
2950 cc = UCHAR21(Feptr);
2951 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
2952 RRETURN(MATCH_NOMATCH);
2954 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2959 for (i = 1; i <= Lmin; i++)
2962 if (Feptr >= mb->end_subject)
2965 RRETURN(MATCH_NOMATCH);
2967 cc = UCHAR21(Feptr);
2968 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
2969 RRETURN(MATCH_NOMATCH);
2971 /* No need to skip more code units - we know it has only one. */
2975 case OP_NOT_WORDCHAR:
2976 for (i = 1; i <= Lmin; i++)
2979 if (Feptr >= mb->end_subject)
2982 RRETURN(MATCH_NOMATCH);
2984 cc = UCHAR21(Feptr);
2985 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
2986 RRETURN(MATCH_NOMATCH);
2988 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2993 for (i = 1; i <= Lmin; i++)
2996 if (Feptr >= mb->end_subject)
2999 RRETURN(MATCH_NOMATCH);
3001 cc = UCHAR21(Feptr);
3002 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3003 RRETURN(MATCH_NOMATCH);
3005 /* No need to skip more code units - we know it has only one. */
3010 return PCRE2_ERROR_INTERNAL;
3011 } /* End switch(Lctype) */
3014 #endif /* SUPPORT_UNICODE */
3016 /* Code for the non-UTF case for minimum matching of operators other
3017 than OP_PROP and OP_NOTPROP. */
3022 for (i = 1; i <= Lmin; i++)
3024 if (Feptr >= mb->end_subject)
3027 RRETURN(MATCH_NOMATCH);
3029 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3030 if (mb->partial != 0 &&
3031 Feptr + 1 >= mb->end_subject &&
3032 NLBLOCK->nltype == NLTYPE_FIXED &&
3033 NLBLOCK->nllen == 2 &&
3034 *Feptr == NLBLOCK->nl[0])
3037 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3044 if (Feptr > mb->end_subject - Lmin)
3047 RRETURN(MATCH_NOMATCH);
3052 /* This OP_ANYBYTE case will never be reached because \C gets turned
3053 into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
3054 reports don't complain about its never being used. */
3057 * if (Feptr > mb->end_subject - Lmin)
3060 * RRETURN(MATCH_NOMATCH);
3066 for (i = 1; i <= Lmin; i++)
3068 if (Feptr >= mb->end_subject)
3071 RRETURN(MATCH_NOMATCH);
3075 default: RRETURN(MATCH_NOMATCH);
3078 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3087 #if PCRE2_CODE_UNIT_WIDTH != 8
3091 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3098 for (i = 1; i <= Lmin; i++)
3100 if (Feptr >= mb->end_subject)
3103 RRETURN(MATCH_NOMATCH);
3109 #if PCRE2_CODE_UNIT_WIDTH != 8
3110 HSPACE_MULTIBYTE_CASES:
3112 RRETURN(MATCH_NOMATCH);
3118 for (i = 1; i <= Lmin; i++)
3120 if (Feptr >= mb->end_subject)
3123 RRETURN(MATCH_NOMATCH);
3127 default: RRETURN(MATCH_NOMATCH);
3129 #if PCRE2_CODE_UNIT_WIDTH != 8
3130 HSPACE_MULTIBYTE_CASES:
3138 for (i = 1; i <= Lmin; i++)
3140 if (Feptr >= mb->end_subject)
3143 RRETURN(MATCH_NOMATCH);
3148 #if PCRE2_CODE_UNIT_WIDTH != 8
3149 VSPACE_MULTIBYTE_CASES:
3151 RRETURN(MATCH_NOMATCH);
3158 for (i = 1; i <= Lmin; i++)
3160 if (Feptr >= mb->end_subject)
3163 RRETURN(MATCH_NOMATCH);
3167 default: RRETURN(MATCH_NOMATCH);
3169 #if PCRE2_CODE_UNIT_WIDTH != 8
3170 VSPACE_MULTIBYTE_CASES:
3178 for (i = 1; i <= Lmin; i++)
3180 if (Feptr >= mb->end_subject)
3183 RRETURN(MATCH_NOMATCH);
3185 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3186 RRETURN(MATCH_NOMATCH);
3192 for (i = 1; i <= Lmin; i++)
3194 if (Feptr >= mb->end_subject)
3197 RRETURN(MATCH_NOMATCH);
3199 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3200 RRETURN(MATCH_NOMATCH);
3205 case OP_NOT_WHITESPACE:
3206 for (i = 1; i <= Lmin; i++)
3208 if (Feptr >= mb->end_subject)
3211 RRETURN(MATCH_NOMATCH);
3213 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3214 RRETURN(MATCH_NOMATCH);
3220 for (i = 1; i <= Lmin; i++)
3222 if (Feptr >= mb->end_subject)
3225 RRETURN(MATCH_NOMATCH);
3227 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3228 RRETURN(MATCH_NOMATCH);
3233 case OP_NOT_WORDCHAR:
3234 for (i = 1; i <= Lmin; i++)
3236 if (Feptr >= mb->end_subject)
3239 RRETURN(MATCH_NOMATCH);
3241 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3242 RRETURN(MATCH_NOMATCH);
3248 for (i = 1; i <= Lmin; i++)
3250 if (Feptr >= mb->end_subject)
3253 RRETURN(MATCH_NOMATCH);
3255 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3256 RRETURN(MATCH_NOMATCH);
3262 return PCRE2_ERROR_INTERNAL;
3266 /* If Lmin = Lmax we are done. Continue with the main loop. */
3268 if (Lmin == Lmax) continue;
3270 /* If minimizing, we have to test the rest of the pattern before each
3271 subsequent match. */
3273 if (reptype == REPTYPE_MIN)
3275 #ifdef SUPPORT_UNICODE
3283 RMATCH(Fecode, RM208);
3284 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3285 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3286 if (Feptr >= mb->end_subject)
3289 RRETURN(MATCH_NOMATCH);
3291 GETCHARINCTEST(fc, Feptr);
3292 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3294 /* Control never gets here */
3300 RMATCH(Fecode, RM209);
3301 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3302 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3303 if (Feptr >= mb->end_subject)
3306 RRETURN(MATCH_NOMATCH);
3308 GETCHARINCTEST(fc, Feptr);
3309 chartype = UCD_CHARTYPE(fc);
3310 if ((chartype == ucp_Lu ||
3311 chartype == ucp_Ll ||
3312 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3313 RRETURN(MATCH_NOMATCH);
3315 /* Control never gets here */
3320 RMATCH(Fecode, RM210);
3321 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3322 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3323 if (Feptr >= mb->end_subject)
3326 RRETURN(MATCH_NOMATCH);
3328 GETCHARINCTEST(fc, Feptr);
3329 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3330 RRETURN(MATCH_NOMATCH);
3332 /* Control never gets here */
3337 RMATCH(Fecode, RM211);
3338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3339 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3340 if (Feptr >= mb->end_subject)
3343 RRETURN(MATCH_NOMATCH);
3345 GETCHARINCTEST(fc, Feptr);
3346 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3347 RRETURN(MATCH_NOMATCH);
3349 /* Control never gets here */
3354 RMATCH(Fecode, RM212);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3357 if (Feptr >= mb->end_subject)
3360 RRETURN(MATCH_NOMATCH);
3362 GETCHARINCTEST(fc, Feptr);
3363 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3364 RRETURN(MATCH_NOMATCH);
3366 /* Control never gets here */
3372 RMATCH(Fecode, RM213);
3373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3374 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3375 if (Feptr >= mb->end_subject)
3378 RRETURN(MATCH_NOMATCH);
3380 GETCHARINCTEST(fc, Feptr);
3381 category = UCD_CATEGORY(fc);
3382 if ((category == ucp_L || category == ucp_N) ==
3383 (Lctype == OP_NOTPROP))
3384 RRETURN(MATCH_NOMATCH);
3386 /* Control never gets here */
3388 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3389 which means that Perl space and POSIX space are now identical. PCRE
3390 was changed at release 8.34. */
3392 case PT_SPACE: /* Perl space */
3393 case PT_PXSPACE: /* POSIX space */
3396 RMATCH(Fecode, RM214);
3397 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3398 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3399 if (Feptr >= mb->end_subject)
3402 RRETURN(MATCH_NOMATCH);
3404 GETCHARINCTEST(fc, Feptr);
3409 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3413 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3414 RRETURN(MATCH_NOMATCH);
3418 /* Control never gets here */
3424 RMATCH(Fecode, RM215);
3425 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3426 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3427 if (Feptr >= mb->end_subject)
3430 RRETURN(MATCH_NOMATCH);
3432 GETCHARINCTEST(fc, Feptr);
3433 category = UCD_CATEGORY(fc);
3434 if ((category == ucp_L ||
3435 category == ucp_N ||
3436 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3437 RRETURN(MATCH_NOMATCH);
3439 /* Control never gets here */
3445 RMATCH(Fecode, RM216);
3446 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3447 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3448 if (Feptr >= mb->end_subject)
3451 RRETURN(MATCH_NOMATCH);
3453 GETCHARINCTEST(fc, Feptr);
3454 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3459 if (Lctype == OP_NOTPROP) break;
3460 RRETURN(MATCH_NOMATCH);
3464 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3469 /* Control never gets here */
3474 RMATCH(Fecode, RM217);
3475 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3476 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3477 if (Feptr >= mb->end_subject)
3480 RRETURN(MATCH_NOMATCH);
3482 GETCHARINCTEST(fc, Feptr);
3483 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3484 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3485 fc >= 0xe000) == (Lctype == OP_NOTPROP))
3486 RRETURN(MATCH_NOMATCH);
3488 /* Control never gets here */
3490 /* This should never occur */
3492 return PCRE2_ERROR_INTERNAL;
3496 /* Match extended Unicode sequences. We will get here only if the
3497 support is in the binary; otherwise a compile-time error occurs. */
3499 else if (Lctype == OP_EXTUNI)
3503 RMATCH(Fecode, RM218);
3504 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3505 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3506 if (Feptr >= mb->end_subject)
3509 RRETURN(MATCH_NOMATCH);
3513 GETCHARINCTEST(fc, Feptr);
3514 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3521 #endif /* SUPPORT_UNICODE */
3523 /* UTF mode for non-property testing character types. */
3525 #ifdef SUPPORT_UNICODE
3530 RMATCH(Fecode, RM219);
3531 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3532 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3533 if (Feptr >= mb->end_subject)
3536 RRETURN(MATCH_NOMATCH);
3538 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3539 GETCHARINC(fc, Feptr);
3542 case OP_ANY: /* This is the non-NL case */
3543 if (mb->partial != 0 && /* Take care with CRLF partial */
3544 Feptr >= mb->end_subject &&
3545 NLBLOCK->nltype == NLTYPE_FIXED &&
3546 NLBLOCK->nllen == 2 &&
3547 fc == NLBLOCK->nl[0])
3550 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3561 default: RRETURN(MATCH_NOMATCH);
3564 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3576 #endif /* Not EBCDIC */
3577 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3578 RRETURN(MATCH_NOMATCH);
3586 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3594 HSPACE_CASES: break;
3595 default: RRETURN(MATCH_NOMATCH);
3602 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3610 VSPACE_CASES: break;
3611 default: RRETURN(MATCH_NOMATCH);
3616 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
3617 RRETURN(MATCH_NOMATCH);
3621 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
3622 RRETURN(MATCH_NOMATCH);
3625 case OP_NOT_WHITESPACE:
3626 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
3627 RRETURN(MATCH_NOMATCH);
3631 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
3632 RRETURN(MATCH_NOMATCH);
3635 case OP_NOT_WORDCHAR:
3636 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
3637 RRETURN(MATCH_NOMATCH);
3641 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
3642 RRETURN(MATCH_NOMATCH);
3646 return PCRE2_ERROR_INTERNAL;
3651 #endif /* SUPPORT_UNICODE */
3657 RMATCH(Fecode, RM33);
3658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3659 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3660 if (Feptr >= mb->end_subject)
3663 RRETURN(MATCH_NOMATCH);
3665 if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
3666 RRETURN(MATCH_NOMATCH);
3670 case OP_ANY: /* This is the non-NL case */
3671 if (mb->partial != 0 && /* Take care with CRLF partial */
3672 Feptr >= mb->end_subject &&
3673 NLBLOCK->nltype == NLTYPE_FIXED &&
3674 NLBLOCK->nllen == 2 &&
3675 fc == NLBLOCK->nl[0])
3678 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3689 default: RRETURN(MATCH_NOMATCH);
3692 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3701 #if PCRE2_CODE_UNIT_WIDTH != 8
3705 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3706 RRETURN(MATCH_NOMATCH);
3716 #if PCRE2_CODE_UNIT_WIDTH != 8
3717 HSPACE_MULTIBYTE_CASES:
3719 RRETURN(MATCH_NOMATCH);
3726 default: RRETURN(MATCH_NOMATCH);
3728 #if PCRE2_CODE_UNIT_WIDTH != 8
3729 HSPACE_MULTIBYTE_CASES:
3740 #if PCRE2_CODE_UNIT_WIDTH != 8
3741 VSPACE_MULTIBYTE_CASES:
3743 RRETURN(MATCH_NOMATCH);
3750 default: RRETURN(MATCH_NOMATCH);
3752 #if PCRE2_CODE_UNIT_WIDTH != 8
3753 VSPACE_MULTIBYTE_CASES:
3760 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
3761 RRETURN(MATCH_NOMATCH);
3765 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
3766 RRETURN(MATCH_NOMATCH);
3769 case OP_NOT_WHITESPACE:
3770 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
3771 RRETURN(MATCH_NOMATCH);
3775 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
3776 RRETURN(MATCH_NOMATCH);
3779 case OP_NOT_WORDCHAR:
3780 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
3781 RRETURN(MATCH_NOMATCH);
3785 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
3786 RRETURN(MATCH_NOMATCH);
3790 return PCRE2_ERROR_INTERNAL;
3794 /* Control never gets here */
3797 /* If maximizing, it is worth using inline code for speed, doing the type
3798 test once at the start (i.e. keep it out of the loop). */
3802 Lstart_eptr = Feptr; /* Remember where we started */
3804 #ifdef SUPPORT_UNICODE
3810 for (i = Lmin; i < Lmax; i++)
3813 if (Feptr >= mb->end_subject)
3818 GETCHARLENTEST(fc, Feptr, len);
3819 if (Lctype == OP_NOTPROP) break;
3825 for (i = Lmin; i < Lmax; i++)
3829 if (Feptr >= mb->end_subject)
3834 GETCHARLENTEST(fc, Feptr, len);
3835 chartype = UCD_CHARTYPE(fc);
3836 if ((chartype == ucp_Lu ||
3837 chartype == ucp_Ll ||
3838 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3845 for (i = Lmin; i < Lmax; i++)
3848 if (Feptr >= mb->end_subject)
3853 GETCHARLENTEST(fc, Feptr, len);
3854 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3861 for (i = Lmin; i < Lmax; i++)
3864 if (Feptr >= mb->end_subject)
3869 GETCHARLENTEST(fc, Feptr, len);
3870 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3877 for (i = Lmin; i < Lmax; i++)
3880 if (Feptr >= mb->end_subject)
3885 GETCHARLENTEST(fc, Feptr, len);
3886 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3893 for (i = Lmin; i < Lmax; i++)
3897 if (Feptr >= mb->end_subject)
3902 GETCHARLENTEST(fc, Feptr, len);
3903 category = UCD_CATEGORY(fc);
3904 if ((category == ucp_L || category == ucp_N) ==
3905 (Lctype == OP_NOTPROP))
3911 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3912 which means that Perl space and POSIX space are now identical. PCRE
3913 was changed at release 8.34. */
3915 case PT_SPACE: /* Perl space */
3916 case PT_PXSPACE: /* POSIX space */
3917 for (i = Lmin; i < Lmax; i++)
3920 if (Feptr >= mb->end_subject)
3925 GETCHARLENTEST(fc, Feptr, len);
3930 if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */
3934 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3935 goto ENDLOOP99; /* Break the loop */
3944 for (i = Lmin; i < Lmax; i++)
3948 if (Feptr >= mb->end_subject)
3953 GETCHARLENTEST(fc, Feptr, len);
3954 category = UCD_CATEGORY(fc);
3955 if ((category == ucp_L || category == ucp_N ||
3956 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3963 for (i = Lmin; i < Lmax; i++)
3967 if (Feptr >= mb->end_subject)
3972 GETCHARLENTEST(fc, Feptr, len);
3973 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3977 { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; }
3979 { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; }
3987 for (i = Lmin; i < Lmax; i++)
3990 if (Feptr >= mb->end_subject)
3995 GETCHARLENTEST(fc, Feptr, len);
3996 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3997 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3998 fc >= 0xe000) == (Lctype == OP_NOTPROP))
4005 return PCRE2_ERROR_INTERNAL;
4008 /* Feptr is now past the end of the maximum run */
4010 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4012 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4013 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go too far. */
4018 if (Feptr <= Lstart_eptr) break;
4019 RMATCH(Fecode, RM222);
4020 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4022 if (utf) BACKCHAR(Feptr);
4026 /* Match extended Unicode grapheme clusters. We will get here only if the
4027 support is in the binary; otherwise a compile-time error occurs. */
4029 else if (Lctype == OP_EXTUNI)
4031 for (i = Lmin; i < Lmax; i++)
4033 if (Feptr >= mb->end_subject)
4040 GETCHARINCTEST(fc, Feptr);
4041 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4047 /* Feptr is now past the end of the maximum run */
4049 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4051 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4052 of the run while backtracking because the use of \C in UTF mode can
4053 cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4054 the use of \C in UTF mode is fraught with danger. */
4061 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4062 RMATCH(Fecode, RM220);
4063 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4065 /* Backtracking over an extended grapheme cluster involves inspecting
4066 the previous two characters (if present) to see if a break is
4067 permitted between them. */
4070 if (!utf) fc = *Feptr; else
4075 rgb = UCD_GRAPHBREAK(fc);
4079 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4081 if (!utf) fc = *fptr; else
4086 lgb = UCD_GRAPHBREAK(fc);
4087 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4095 #endif /* SUPPORT_UNICODE */
4097 #ifdef SUPPORT_UNICODE
4103 for (i = Lmin; i < Lmax; i++)
4105 if (Feptr >= mb->end_subject)
4110 if (IS_NEWLINE(Feptr)) break;
4111 if (mb->partial != 0 && /* Take care with CRLF partial */
4112 Feptr + 1 >= mb->end_subject &&
4113 NLBLOCK->nltype == NLTYPE_FIXED &&
4114 NLBLOCK->nllen == 2 &&
4115 UCHAR21(Feptr) == NLBLOCK->nl[0])
4118 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4121 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4126 if (Lmax < UINT32_MAX)
4128 for (i = Lmin; i < Lmax; i++)
4130 if (Feptr >= mb->end_subject)
4136 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4141 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4146 /* The "byte" (i.e. "code unit") case is the same as non-UTF */
4150 if (fc > (uint32_t)(mb->end_subject - Feptr))
4152 Feptr = mb->end_subject;
4159 for (i = Lmin; i < Lmax; i++)
4162 if (Feptr >= mb->end_subject)
4167 GETCHARLEN(fc, Feptr, len);
4170 if (++Feptr >= mb->end_subject) break;
4171 if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4175 if (fc != CHAR_LF &&
4176 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4177 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4179 && fc != 0x2028 && fc != 0x2029
4180 #endif /* Not EBCDIC */
4190 for (i = Lmin; i < Lmax; i++)
4194 if (Feptr >= mb->end_subject)
4199 GETCHARLEN(fc, Feptr, len);
4202 HSPACE_CASES: gotspace = TRUE; break;
4203 default: gotspace = FALSE; break;
4205 if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4212 for (i = Lmin; i < Lmax; i++)
4216 if (Feptr >= mb->end_subject)
4221 GETCHARLEN(fc, Feptr, len);
4224 VSPACE_CASES: gotspace = TRUE; break;
4225 default: gotspace = FALSE; break;
4227 if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4233 for (i = Lmin; i < Lmax; i++)
4236 if (Feptr >= mb->end_subject)
4241 GETCHARLEN(fc, Feptr, len);
4242 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4248 for (i = Lmin; i < Lmax; i++)
4251 if (Feptr >= mb->end_subject)
4256 GETCHARLEN(fc, Feptr, len);
4257 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4262 case OP_NOT_WHITESPACE:
4263 for (i = Lmin; i < Lmax; i++)
4266 if (Feptr >= mb->end_subject)
4271 GETCHARLEN(fc, Feptr, len);
4272 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4278 for (i = Lmin; i < Lmax; i++)
4281 if (Feptr >= mb->end_subject)
4286 GETCHARLEN(fc, Feptr, len);
4287 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4292 case OP_NOT_WORDCHAR:
4293 for (i = Lmin; i < Lmax; i++)
4296 if (Feptr >= mb->end_subject)
4301 GETCHARLEN(fc, Feptr, len);
4302 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4308 for (i = Lmin; i < Lmax; i++)
4311 if (Feptr >= mb->end_subject)
4316 GETCHARLEN(fc, Feptr, len);
4317 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4323 return PCRE2_ERROR_INTERNAL;
4326 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4328 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4329 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go too far. */
4334 if (Feptr <= Lstart_eptr) break;
4335 RMATCH(Fecode, RM221);
4336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4339 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4340 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4345 #endif /* SUPPORT_UNICODE */
4352 for (i = Lmin; i < Lmax; i++)
4354 if (Feptr >= mb->end_subject)
4359 if (IS_NEWLINE(Feptr)) break;
4360 if (mb->partial != 0 && /* Take care with CRLF partial */
4361 Feptr + 1 >= mb->end_subject &&
4362 NLBLOCK->nltype == NLTYPE_FIXED &&
4363 NLBLOCK->nllen == 2 &&
4364 *Feptr == NLBLOCK->nl[0])
4367 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4376 if (fc > (uint32_t)(mb->end_subject - Feptr))
4378 Feptr = mb->end_subject;
4385 for (i = Lmin; i < Lmax; i++)
4387 if (Feptr >= mb->end_subject)
4395 if (++Feptr >= mb->end_subject) break;
4396 if (*Feptr == CHAR_LF) Feptr++;
4400 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4401 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4402 #if PCRE2_CODE_UNIT_WIDTH != 8
4403 && fc != 0x2028 && fc != 0x2029
4412 for (i = Lmin; i < Lmax; i++)
4414 if (Feptr >= mb->end_subject)
4421 default: Feptr++; break;
4423 #if PCRE2_CODE_UNIT_WIDTH != 8
4424 HSPACE_MULTIBYTE_CASES:
4433 for (i = Lmin; i < Lmax; i++)
4435 if (Feptr >= mb->end_subject)
4442 default: goto ENDLOOP01;
4444 #if PCRE2_CODE_UNIT_WIDTH != 8
4445 HSPACE_MULTIBYTE_CASES:
4454 for (i = Lmin; i < Lmax; i++)
4456 if (Feptr >= mb->end_subject)
4463 default: Feptr++; break;
4465 #if PCRE2_CODE_UNIT_WIDTH != 8
4466 VSPACE_MULTIBYTE_CASES:
4475 for (i = Lmin; i < Lmax; i++)
4477 if (Feptr >= mb->end_subject)
4484 default: goto ENDLOOP03;
4486 #if PCRE2_CODE_UNIT_WIDTH != 8
4487 VSPACE_MULTIBYTE_CASES:
4496 for (i = Lmin; i < Lmax; i++)
4498 if (Feptr >= mb->end_subject)
4503 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
4510 for (i = Lmin; i < Lmax; i++)
4512 if (Feptr >= mb->end_subject)
4517 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
4523 case OP_NOT_WHITESPACE:
4524 for (i = Lmin; i < Lmax; i++)
4526 if (Feptr >= mb->end_subject)
4531 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
4538 for (i = Lmin; i < Lmax; i++)
4540 if (Feptr >= mb->end_subject)
4545 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
4551 case OP_NOT_WORDCHAR:
4552 for (i = Lmin; i < Lmax; i++)
4554 if (Feptr >= mb->end_subject)
4559 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
4566 for (i = Lmin; i < Lmax; i++)
4568 if (Feptr >= mb->end_subject)
4573 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
4580 return PCRE2_ERROR_INTERNAL;
4583 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4587 if (Feptr == Lstart_eptr) break;
4588 RMATCH(Fecode, RM34);
4589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4591 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
4592 Feptr[-1] == CHAR_CR) Feptr--;
4596 break; /* End of repeat character type processing */
4605 /* ===================================================================== */
4606 /* Match a back reference, possibly repeatedly. Look past the end of the
4607 item to see if there is repeat information following. The OP_REF and
4608 OP_REFI opcodes are used for a reference to a numbered group or to a
4609 non-duplicated named group. For a duplicated named group, OP_DNREF and
4610 OP_DNREFI are used. In this case we must scan the list of groups to which
4611 the name refers, and use the first one that is set. */
4613 #define Lmin F->temp_32[0]
4614 #define Lmax F->temp_32[1]
4615 #define Lcaseless F->temp_32[2]
4616 #define Lstart F->temp_sptr[0]
4617 #define Loffset F->temp_size
4621 Lcaseless = (Fop == OP_DNREFI);
4623 int count = GET2(Fecode, 1+IMM2_SIZE);
4624 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
4625 Fecode += 1 + 2*IMM2_SIZE;
4629 Loffset = (GET2(slot, 0) << 1) - 2;
4630 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
4631 slot += mb->name_entry_size;
4638 Lcaseless = (Fop == OP_REFI);
4639 Loffset = (GET2(Fecode, 1) << 1) - 2;
4640 Fecode += 1 + IMM2_SIZE;
4642 /* Set up for repetition, or handle the non-repeated case. The maximum and
4643 minimum must be in the heap frame, but as they are short-term values, we
4644 use temporary fields. */
4655 fc = *Fecode++ - OP_CRSTAR;
4658 reptype = rep_typ[fc];
4663 Lmin = GET2(Fecode, 1);
4664 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
4665 reptype = rep_typ[*Fecode - OP_CRSTAR];
4666 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
4667 Fecode += 1 + 2 * IMM2_SIZE;
4670 default: /* No repeat follows */
4672 rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
4675 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4677 RRETURN(MATCH_NOMATCH);
4681 continue; /* With the main loop */
4684 /* Handle repeated back references. If a set group has length zero, just
4685 continue with the main loop, because it matches however many times. For an
4686 unset reference, if the minimum is zero, we can also just continue. We can
4687 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset
4688 group behave as a zero-length group. For any other unset cases, carrying
4689 on will result in NOMATCH. */
4691 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
4693 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
4695 else /* Group is not set */
4697 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
4701 /* First, ensure the minimum number of matches are present. */
4703 for (i = 1; i <= Lmin; i++)
4706 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4709 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4711 RRETURN(MATCH_NOMATCH);
4716 /* If min = max, we are done. They are not both allowed to be zero. */
4718 if (Lmin == Lmax) continue;
4720 /* If minimizing, keep trying and advancing the pointer. */
4722 if (reptype == REPTYPE_MIN)
4727 RMATCH(Fecode, RM20);
4728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4729 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4730 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4733 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4735 RRETURN(MATCH_NOMATCH);
4739 /* Control never gets here */
4742 /* If maximizing, find the longest string and work backwards, as long as
4743 the matched lengths for each iteration are the same. */
4747 BOOL samelengths = TRUE;
4748 Lstart = Feptr; /* Starting position */
4749 Flength = Fovector[Loffset+1] - Fovector[Loffset];
4751 for (i = Lmin; i < Lmax; i++)
4754 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4757 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
4758 the soft partial matching case. */
4760 if (rrc > 0 && mb->partial != 0 &&
4761 mb->end_subject > mb->start_used_ptr)
4764 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4769 if (slength != Flength) samelengths = FALSE;
4773 /* If the length matched for each repetition is the same as the length of
4774 the captured group, we can easily work backwards. This is the normal
4775 case. However, in caseless UTF-8 mode there are pairs of case-equivalent
4776 characters whose lengths (in terms of code units) differ. Because this
4777 is very rare, we handle it by re-matching fewer and fewer times. */
4781 while (Feptr >= Lstart)
4783 RMATCH(Fecode, RM21);
4784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4789 /* The rare case of non-matching lengths. Re-scan the repetition for each
4790 iteration. We know that match_ref() will succeed every time. */
4797 RMATCH(Fecode, RM22);
4798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4799 if (Feptr == Lstart) break; /* Failed after minimal repetition */
4802 for (i = Lmin; i < Lmax; i++)
4805 (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
4811 RRETURN(MATCH_NOMATCH);
4813 /* Control never gets here */
4823 /* ========================================================================= */
4824 /* Opcodes for the start of various parenthesized items */
4825 /* ========================================================================= */
4827 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
4828 (*THEN) is within the current branch by comparing the address of OP_THEN
4829 that is passed back with the end of the branch. If (*THEN) is within the
4830 current branch, and the branch is one of two or more alternatives (it
4831 either starts or ends with OP_ALT), we have reached the limit of THEN's
4832 action, so convert the return code to NOMATCH, which will cause normal
4833 backtracking to happen from now on. Otherwise, THEN is passed back to an
4834 outer alternative. This implements Perl's treatment of parenthesized
4835 groups, where a group not containing | does not affect the current
4836 alternative, that is, (X) is NOT the same as (X|(*F)). */
4839 /* ===================================================================== */
4840 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
4841 bracket group, indicating that it may occur zero times. It may repeat
4842 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
4843 the pattern. Brackets with fixed upper repeat limits are compiled as a
4844 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
4845 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
4847 #define Lnext_ecode F->temp_sptr[0]
4850 Lnext_ecode = Fecode + 1;
4851 RMATCH(Lnext_ecode, RM9);
4852 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4853 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4854 Fecode = Lnext_ecode + 1 + LINK_SIZE;
4858 Lnext_ecode = Fecode + 1;
4859 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4860 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
4861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4869 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
4870 Fecode += 1 + LINK_SIZE;
4874 /* ===================================================================== */
4875 /* Handle possessive brackets with an unlimited repeat. The end of these
4876 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
4877 going further in the pattern. */
4879 #define Lframe_type F->temp_32[0]
4880 #define Lmatched_once F->temp_32[1]
4881 #define Lzero_allowed F->temp_32[2]
4882 #define Lstart_eptr F->temp_sptr[0]
4883 #define Lstart_group F->temp_sptr[1]
4886 Lzero_allowed = TRUE; /* Zero repeat is allowed */
4888 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
4889 goto POSSESSIVE_CAPTURE;
4890 goto POSSESSIVE_NON_CAPTURE;
4894 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4896 POSSESSIVE_NON_CAPTURE:
4897 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
4898 goto POSSESSIVE_GROUP;
4902 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4905 number = GET2(Fecode, 1+LINK_SIZE);
4906 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
4909 Lmatched_once = FALSE; /* Never matched */
4910 Lstart_group = Fecode; /* Start of this group */
4914 Lstart_eptr = Feptr; /* Position at group start */
4915 group_frame_type = Lframe_type;
4916 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
4917 if (rrc == MATCH_KETRPOS)
4919 Lmatched_once = TRUE; /* Matched at least once */
4920 if (Feptr == Lstart_eptr) /* Empty match; skip to end */
4922 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
4926 Fecode = Lstart_group;
4930 /* See comment above about handling THEN. */
4932 if (rrc == MATCH_THEN)
4934 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
4935 if (mb->verb_ecode_ptr < next_ecode &&
4936 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
4937 rrc = MATCH_NOMATCH;
4940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4941 Fecode += GET(Fecode, 1);
4942 if (*Fecode != OP_ALT) break;
4945 /* Success if matched something or zero repeat allowed */
4947 if (Lmatched_once || Lzero_allowed)
4949 Fecode += 1 + LINK_SIZE;
4953 RRETURN(MATCH_NOMATCH);
4955 #undef Lmatched_once
4956 #undef Lzero_allowed
4962 /* ===================================================================== */
4963 /* Handle non-capturing brackets that cannot match an empty string. When we
4964 get to the final alternative within the brackets, as long as there are no
4965 THEN's in the pattern, we can optimize by not recording a new backtracking
4966 point. (Ideally we should test for a THEN within this group, but we don't
4967 have that information.) Don't do this if we are at the very top level,
4968 however, because that would make handling assertions and once-only brackets
4969 messier when there is nothing to go back to. */
4971 #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
4972 #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
4975 if (mb->hasthen || Frdepth == 0)
4983 Lnext_branch = Fecode + GET(Fecode, 1);
4984 if (*Lnext_branch != OP_ALT) break;
4986 /* This is never the final branch. We do not need to test for MATCH_THEN
4987 here because this code is not used when there is a THEN in the pattern. */
4989 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
4990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4991 Fecode = Lnext_branch;
4994 /* Hit the start of the final branch. Continue at this level. */
4996 Fecode += PRIV(OP_lengths)[*Fecode];
5002 /* ===================================================================== */
5003 /* Handle a capturing bracket, other than those that are possessive with an
5004 unlimited repeat. */
5008 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5012 /* ===================================================================== */
5013 /* Atomic groups and non-capturing brackets that can match an empty string
5014 must record a backtracking point and also set up a chained frame. */
5018 Lframe_type = GF_NOCAPTURE | Fop;
5023 group_frame_type = Lframe_type;
5024 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5025 if (rrc == MATCH_THEN)
5027 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5028 if (mb->verb_ecode_ptr < next_ecode &&
5029 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5030 rrc = MATCH_NOMATCH;
5032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5033 Fecode += GET(Fecode, 1);
5034 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5036 /* Control never reaches here. */
5041 /* ===================================================================== */
5042 /* Recursion either matches the current regex, or some subexpression. The
5043 offset data is the offset to the starting bracket from the start of the
5044 whole pattern. (This is so that it works from duplicated subpatterns.) */
5046 #define Lframe_type F->temp_32[0]
5047 #define Lstart_branch F->temp_sptr[0]
5050 bracode = mb->start_code + GET(Fecode, 1);
5051 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5053 /* If we are already in a recursion, check for repeating the same one
5054 without advancing the subject pointer. This should catch convoluted mutual
5055 recursions. (Some simple cases are caught at compile time.) */
5057 if (Fcurrent_recurse != RECURSE_UNSET)
5059 offset = Flast_group_offset;
5060 while (offset != PCRE2_UNSET)
5062 N = (heapframe *)((char *)mb->match_frames + offset);
5063 P = (heapframe *)((char *)N - frame_size);
5064 if (N->group_frame_type == (GF_RECURSE | number))
5066 if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
5069 offset = P->last_group_offset;
5073 /* Now run the recursion, branch by branch. */
5075 Lstart_branch = bracode;
5076 Lframe_type = GF_RECURSE | number;
5080 PCRE2_SPTR next_ecode;
5082 group_frame_type = Lframe_type;
5083 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5084 next_ecode = Lstart_branch + GET(Lstart_branch,1);
5086 /* Handle backtracking verbs, which are defined in a range that can
5087 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5088 escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5090 When one of these verbs triggers, the current recursion group number is
5091 recorded. If it matches the recursion we are processing, the verb
5092 happened within the recursion and we must deal with it. Otherwise it must
5093 have happened after the recursion completed, and so has to be passed
5094 back. See comment above about handling THEN. */
5096 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5097 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5099 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5100 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5101 rrc = MATCH_NOMATCH;
5102 else RRETURN(MATCH_NOMATCH);
5105 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5106 OP_ACCEPT code. Nothing needs to be done here. */
5108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5109 Lstart_branch = next_ecode;
5110 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5112 /* Control never reaches here. */
5115 #undef Lstart_branch
5118 /* ===================================================================== */
5119 /* Positive assertions are like other groups except that PCRE doesn't allow
5120 the effect of (*THEN) to escape beyond an assertion; it is therefore
5121 treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its
5122 captures and mark retained. Any other return is an error. */
5124 #define Lframe_type F->temp_32[0]
5128 Lframe_type = GF_NOCAPTURE | Fop;
5131 group_frame_type = Lframe_type;
5132 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
5133 if (rrc == MATCH_ACCEPT)
5136 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5137 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5138 Foffset_top = assert_accept_frame->offset_top;
5139 Fmark = assert_accept_frame->mark;
5142 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5143 Fecode += GET(Fecode, 1);
5144 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5147 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5148 Fecode += 1 + LINK_SIZE;
5154 /* ===================================================================== */
5155 /* Handle negative assertions. Loop for each non-matching branch as for
5156 positive assertions. */
5158 #define Lframe_type F->temp_32[0]
5161 case OP_ASSERTBACK_NOT:
5162 Lframe_type = GF_NOCAPTURE | Fop;
5166 group_frame_type = Lframe_type;
5167 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5170 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5172 RRETURN (MATCH_NOMATCH);
5174 case MATCH_NOMATCH: /* Branch failed, try next if present. */
5176 Fecode += GET(Fecode, 1);
5177 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5180 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5183 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5184 goto ASSERT_NOT_FAILED;
5186 default: /* Pass back any other return */
5191 /* None of the branches have matched or there was a backtrack to (*COMMIT),
5192 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5193 negative assertion, so carry on. */
5196 Fecode += 1 + LINK_SIZE;
5202 /* ===================================================================== */
5203 /* The callout item calls an external function, if one is provided, passing
5204 details of the match so far. This is mainly for debugging, though the
5205 function is able to force a failure. */
5208 case OP_CALLOUT_STR:
5209 rrc = do_callout(F, mb, &length);
5210 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5211 if (rrc < 0) RRETURN(rrc);
5216 /* ===================================================================== */
5217 /* Conditional group: compilation checked that there are no more than two
5218 branches. If the condition is false, skipping the first branch takes us
5219 past the end of the item if there is only one branch, but that's exactly
5225 /* The variable Flength will be added to Fecode when the condition is
5226 false, to get to the second branch. Setting it to the offset to the ALT or
5227 KET, then incrementing Fecode achieves this effect. However, if the second
5228 branch is non-existent, we must point to the KET so that the end of the
5229 group is correctly processed. We now have Fecode pointing to the condition
5232 Flength = GET(Fecode, 1); /* Offset to the second branch */
5233 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5234 Fecode += 1 + LINK_SIZE; /* From this opcode */
5236 /* Because of the way auto-callout works during compile, a callout item is
5237 inserted between OP_COND and an assertion condition. Such a callout can
5238 also be inserted manually. */
5240 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5242 rrc = do_callout(F, mb, &length);
5243 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5244 if (rrc < 0) RRETURN(rrc);
5246 /* Advance Fecode past the callout, so it now points to the condition. We
5247 must adjust Flength so that the value of Fecode+Flength is unchanged. */
5253 /* Test the various possible conditions */
5258 case OP_RREF: /* Group recursion test */
5259 if (Fcurrent_recurse != RECURSE_UNSET)
5261 number = GET2(Fecode, 1);
5262 condition = (number == RREF_ANY || number == Fcurrent_recurse);
5266 case OP_DNRREF: /* Duplicate named group recursion test */
5267 if (Fcurrent_recurse != RECURSE_UNSET)
5269 int count = GET2(Fecode, 1 + IMM2_SIZE);
5270 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5273 number = GET2(slot, 0);
5274 condition = number == Fcurrent_recurse;
5275 if (condition) break;
5276 slot += mb->name_entry_size;
5281 case OP_CREF: /* Numbered group used test */
5282 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5283 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5286 case OP_DNCREF: /* Duplicate named group used test */
5288 int count = GET2(Fecode, 1 + IMM2_SIZE);
5289 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5292 offset = (GET2(slot, 0) << 1) - 2;
5293 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5294 if (condition) break;
5295 slot += mb->name_entry_size;
5301 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5308 /* The condition is an assertion. Run code similar to the assertion code
5311 #define Lpositive F->temp_32[0]
5312 #define Lstart_branch F->temp_sptr[0]
5315 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5316 Lstart_branch = Fecode;
5320 group_frame_type = GF_CONDASSERT | *Fecode;
5321 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5325 case MATCH_ACCEPT: /* Save captures */
5327 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5328 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5329 Foffset_top = assert_accept_frame->offset_top;
5332 /* In the case of a match, the captures have already been put into
5333 the current frame. */
5336 condition = Lpositive; /* TRUE for positive assertion */
5339 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5340 assertion; it is therefore always treated as NOMATCH. */
5344 Lstart_branch += GET(Lstart_branch, 1);
5345 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
5346 condition = !Lpositive; /* TRUE for negative assertion */
5349 /* These force no match without checking other branches. */
5354 condition = !Lpositive;
5360 break; /* Out of the branch loop */
5363 /* If the condition is true, find the end of the assertion so that
5364 advancing past it gets us to the start of the first branch. */
5368 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5370 break; /* End of assertion condition */
5374 #undef Lstart_branch
5376 /* Choose branch according to the condition. */
5378 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
5380 /* If the opcode is OP_SCOND it means we are at a repeated conditional
5381 group that might match an empty string. We must therefore descend a level
5382 so that the start is remembered for checking. For OP_COND we can just
5383 continue at this level. */
5385 if (Fop == OP_SCOND)
5387 group_frame_type = GF_NOCAPTURE | Fop;
5388 RMATCH(Fecode, RM35);
5395 /* ========================================================================= */
5396 /* End of start of parenthesis opcodes */
5397 /* ========================================================================= */
5400 /* ===================================================================== */
5401 /* Move the subject pointer back. This occurs only at the start of each
5402 branch of a lookbehind assertion. If we are too close to the start to move
5403 back, fail. When working with UTF-8 we move back a number of characters,
5407 number = GET(Fecode, 1);
5408 #ifdef SUPPORT_UNICODE
5411 while (number-- > 0)
5413 if (Feptr <= mb->start_subject) RRETURN(MATCH_NOMATCH);
5421 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
5424 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
5428 /* Save the earliest consulted character, then skip to next opcode */
5430 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
5431 Fecode += 1 + LINK_SIZE;
5435 /* ===================================================================== */
5436 /* An alternation is the end of a branch; scan along to find the end of the
5440 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5444 /* ===================================================================== */
5445 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
5446 starting frame was added to the chained frames in order to remember the
5447 starting subject position for the group. */
5454 bracode = Fecode - GET(Fecode, 1);
5456 /* Point N to the frame at the start of the most recent group.
5457 Remember the subject pointer at the start of the group. */
5459 if (*bracode != OP_BRA && *bracode != OP_COND)
5461 N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
5462 P = (heapframe *)((char *)N - frame_size);
5463 Flast_group_offset = P->last_group_offset;
5465 #ifdef DEBUG_SHOW_RMATCH
5466 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
5467 N->rdepth, N->group_frame_type,
5468 (char *)P->eptr - (char *)mb->start_subject);
5471 /* If we are at the end of an assertion that is a condition, return a
5472 match, discarding any intermediate backtracking points. Copy back the
5473 captures into the frame before N so that they are set on return. Doing
5474 this for all assertions, both positive and negative, seems to match what
5477 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
5479 memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
5480 Foffset_top * sizeof(PCRE2_SIZE));
5481 P->offset_top = Foffset_top;
5482 Fback_frame = (char *)F - (char *)P;
5483 RRETURN(MATCH_MATCH);
5486 else P = NULL; /* Indicates starting frame not recorded */
5488 /* The group was not a conditional assertion. */
5492 case OP_BRA: /* No need to do anything for these */
5497 /* Positive assertions are like OP_ONCE, except that in addition the
5498 subject pointer must be put back to where it was at the start of the
5503 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5507 /* For an atomic group, discard internal backtracking points. We must
5508 also ensure that any remaining branches within the top-level of the group
5509 are not tried. Do this by adjusting the code pointer within the backtrack
5510 frame so that it points to the final branch. */
5513 Fback_frame = ((char *)F - (char *)P);
5516 uint32_t y = GET(P->ecode,1);
5517 if ((P->ecode)[y] != OP_ALT) break;
5522 /* A matching negative assertion returns MATCH, which is turned into
5523 NOMATCH at the assertion level. */
5526 case OP_ASSERTBACK_NOT:
5527 RRETURN(MATCH_MATCH);
5529 /* Whole-pattern recursion is coded as a recurse into group 0, so it
5530 won't be picked up here. Instead, we catch it when the OP_END is reached.
5531 Other recursion is handled here. */
5537 number = GET2(bracode, 1+LINK_SIZE);
5539 /* Handle a recursively called group. We reinstate the previous set of
5540 captures and then carry on after the recursion call. */
5542 if (Fcurrent_recurse == number)
5544 P = (heapframe *)((char *)N - frame_size);
5545 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5546 P->offset_top * sizeof(PCRE2_SIZE));
5547 Foffset_top = P->offset_top;
5548 Fcapture_last = P->capture_last;
5549 Fcurrent_recurse = P->current_recurse;
5550 Fecode = P->ecode + 1 + LINK_SIZE;
5551 continue; /* With next opcode */
5554 /* Deal with actual capturing. */
5556 offset = (number << 1) - 2;
5557 Fcapture_last = number;
5558 Fovector[offset] = P->eptr - mb->start_subject;
5559 Fovector[offset+1] = Feptr - mb->start_subject;
5560 if (offset >= Foffset_top) Foffset_top = offset + 2;
5562 } /* End actions relating to the starting opcode */
5564 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
5565 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
5566 at a time from the outer level. This must precede the empty string test -
5567 in this case that test is done at the outer level. */
5569 if (*Fecode == OP_KETRPOS)
5571 memcpy((char *)P + offsetof(heapframe, eptr),
5572 (char *)F + offsetof(heapframe, eptr),
5574 RRETURN(MATCH_KETRPOS);
5577 /* Handle the different kinds of closing brackets. A non-repeating ket
5578 needs no special action, just continuing at this level. This also happens
5579 for the repeating kets if the group matched no characters, in order to
5580 forcibly break infinite loops. Otherwise, the repeating kets try the rest
5581 of the pattern or restart from the preceding bracket, in the appropriate
5584 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
5586 if (Fop == OP_KETRMIN)
5588 RMATCH(Fecode + 1 + LINK_SIZE, RM6);
5589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5590 Fecode -= GET(Fecode, 1);
5591 break; /* End of ket processing */
5594 /* Repeat the maximum number of times (KETRMAX) */
5596 RMATCH(bracode, RM7);
5597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5600 /* Carry on at this level for a non-repeating ket, or after matching an
5601 empty string, or after repeating for a maximum number of times. */
5603 Fecode += 1 + LINK_SIZE;
5607 /* ===================================================================== */
5608 /* Start and end of line assertions, not multiline mode. */
5610 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
5611 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
5612 RRETURN(MATCH_NOMATCH);
5616 case OP_SOD: /* Unconditional start of subject */
5617 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
5621 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
5622 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
5625 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5626 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
5629 /* Unconditional end of subject assertion (\z) */
5632 if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
5637 /* End of subject or ending \n assertion (\Z) */
5641 if (Feptr < mb->end_subject &&
5642 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
5644 if (mb->partial != 0 &&
5645 Feptr + 1 >= mb->end_subject &&
5646 NLBLOCK->nltype == NLTYPE_FIXED &&
5647 NLBLOCK->nllen == 2 &&
5648 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5651 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5653 RRETURN(MATCH_NOMATCH);
5656 /* Either at end of string or \n before end. */
5663 /* ===================================================================== */
5664 /* Start and end of line assertions, multiline mode. */
5666 /* Start of subject unless notbol, or after any newline except for one at
5667 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
5670 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
5671 RRETURN(MATCH_NOMATCH);
5672 if (Feptr != mb->start_subject &&
5673 ((Feptr == mb->end_subject &&
5674 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
5675 !WAS_NEWLINE(Feptr)))
5676 RRETURN(MATCH_NOMATCH);
5680 /* Assert before any newline, or before end of subject unless noteol is
5684 if (Feptr < mb->end_subject)
5686 if (!IS_NEWLINE(Feptr))
5688 if (mb->partial != 0 &&
5689 Feptr + 1 >= mb->end_subject &&
5690 NLBLOCK->nltype == NLTYPE_FIXED &&
5691 NLBLOCK->nllen == 2 &&
5692 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5695 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5697 RRETURN(MATCH_NOMATCH);
5702 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5709 /* ===================================================================== */
5710 /* Start of match assertion */
5713 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
5718 /* ===================================================================== */
5719 /* Reset the start of match point */
5722 Fstart_match = Feptr;
5727 /* ===================================================================== */
5728 /* Word boundary assertions. Find out if the previous and current
5729 characters are "word" characters. It takes a bit more work in UTF mode.
5730 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
5731 not set. When it is set, use Unicode properties if available, even when not
5732 in UTF mode. Remember the earliest and latest consulted characters. */
5734 case OP_NOT_WORD_BOUNDARY:
5735 case OP_WORD_BOUNDARY:
5736 if (Feptr == mb->start_subject) prev_is_word = FALSE; else
5738 PCRE2_SPTR lastptr = Feptr - 1;
5739 #ifdef SUPPORT_UNICODE
5743 GETCHAR(fc, lastptr);
5746 #endif /* SUPPORT_UNICODE */
5748 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
5749 #ifdef SUPPORT_UNICODE
5750 if ((mb->poptions & PCRE2_UCP) != 0)
5752 if (fc == '_') prev_is_word = TRUE; else
5754 int cat = UCD_CATEGORY(fc);
5755 prev_is_word = (cat == ucp_L || cat == ucp_N);
5759 #endif /* SUPPORT_UNICODE */
5760 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5763 /* Get status of next character */
5765 if (Feptr >= mb->end_subject)
5768 cur_is_word = FALSE;
5772 PCRE2_SPTR nextptr = Feptr + 1;
5773 #ifdef SUPPORT_UNICODE
5776 FORWARDCHARTEST(nextptr, mb->end_subject);
5780 #endif /* SUPPORT_UNICODE */
5782 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
5783 #ifdef SUPPORT_UNICODE
5784 if ((mb->poptions & PCRE2_UCP) != 0)
5786 if (fc == '_') cur_is_word = TRUE; else
5788 int cat = UCD_CATEGORY(fc);
5789 cur_is_word = (cat == ucp_L || cat == ucp_N);
5793 #endif /* SUPPORT_UNICODE */
5794 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5797 /* Now see if the situation is what we want */
5799 if ((*Fecode++ == OP_WORD_BOUNDARY)?
5800 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5801 RRETURN(MATCH_NOMATCH);
5805 /* ===================================================================== */
5806 /* Backtracking (*VERB)s, with and without arguments. Note that if the
5807 pattern is successfully matched, we do not come back from RMATCH. */
5810 Fmark = mb->nomatch_mark = Fecode + 2;
5811 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
5813 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
5814 argument, and we must check whether that argument matches this MARK's
5815 argument. It is passed back in mb->verb_skip_ptr. If it does match, we
5816 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
5817 position that corresponds to this mark. Otherwise, pass back the return
5820 if (rrc == MATCH_SKIP_ARG &&
5821 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
5823 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5824 RRETURN(MATCH_SKIP);
5829 RRETURN(MATCH_NOMATCH);
5831 /* Record the current recursing group number in mb->verb_current_recurse
5832 when a backtracking return such as MATCH_COMMIT is given. This enables the
5833 recurse processing to catch verbs from within the recursion. */
5836 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
5837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5838 mb->verb_current_recurse = Fcurrent_recurse;
5839 RRETURN(MATCH_COMMIT);
5842 Fmark = mb->nomatch_mark = Fecode + 2;
5843 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
5844 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5845 mb->verb_current_recurse = Fcurrent_recurse;
5846 RRETURN(MATCH_COMMIT);
5849 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
5850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5851 mb->verb_current_recurse = Fcurrent_recurse;
5852 RRETURN(MATCH_PRUNE);
5855 Fmark = mb->nomatch_mark = Fecode + 2;
5856 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
5857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5858 mb->verb_current_recurse = Fcurrent_recurse;
5859 RRETURN(MATCH_PRUNE);
5862 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
5863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5864 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5865 mb->verb_current_recurse = Fcurrent_recurse;
5866 RRETURN(MATCH_SKIP);
5868 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
5869 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
5870 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
5871 that failed and any that precede it (either they also failed, or were not
5872 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
5873 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
5874 set to the count of the one that failed. */
5877 mb->skip_arg_count++;
5878 if (mb->skip_arg_count <= mb->ignore_skip_arg)
5880 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
5883 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
5884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5886 /* Pass back the current skip name and return the special MATCH_SKIP_ARG
5887 return code. This will either be caught by a matching MARK, or get to the
5888 top, where it causes a rematch with mb->ignore_skip_arg set to the value of
5889 mb->skip_arg_count. */
5891 mb->verb_skip_ptr = Fecode + 2;
5892 mb->verb_current_recurse = Fcurrent_recurse;
5893 RRETURN(MATCH_SKIP_ARG);
5895 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
5896 the branch in which it occurs can be determined. */
5899 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
5900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5901 mb->verb_ecode_ptr = Fecode;
5902 mb->verb_current_recurse = Fcurrent_recurse;
5903 RRETURN(MATCH_THEN);
5906 Fmark = mb->nomatch_mark = Fecode + 2;
5907 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
5908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5909 mb->verb_ecode_ptr = Fecode;
5910 mb->verb_current_recurse = Fcurrent_recurse;
5911 RRETURN(MATCH_THEN);
5914 /* ===================================================================== */
5915 /* There's been some horrible disaster. Arrival here can only mean there is
5916 something seriously wrong in the code above or the OP_xxx definitions. */
5919 return PCRE2_ERROR_INTERNAL;
5922 /* Do not insert any code in here without much thought; it is assumed
5923 that "continue" in the code above comes out to here to repeat the main
5926 } /* End of main loop */
5927 /* Control never reaches here */
5930 /* ========================================================================= */
5931 /* The RRETURN() macro jumps here. The number that is saved in Freturn_id
5932 indicates which label we actually want to return to. The value in Frdepth is
5933 the index number of the frame in the vector. The return value has been placed
5936 #define LBL(val) case val: goto L_RM##val;
5939 if (Frdepth == 0) return rrc; /* Exit from the top level */
5940 F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
5941 mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
5943 #ifdef DEBUG_SHOW_RMATCH
5944 fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
5949 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5950 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
5951 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
5952 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
5953 LBL(33) LBL(34) LBL(35) LBL(36)
5955 #ifdef SUPPORT_WIDE_CHARS
5959 #ifdef SUPPORT_UNICODE
5960 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
5961 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
5962 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
5967 return PCRE2_ERROR_INTERNAL;
5973 /*************************************************
5974 * Match a Regular Expression *
5975 *************************************************/
5977 /* This function applies a compiled pattern to a subject string and picks out
5978 portions of the string if it matches. Two elements in the vector are set for
5979 each substring: the offsets to the start and end of the substring.
5982 code points to the compiled expression
5983 subject points to the subject string
5984 length length of subject string (may contain binary zeros)
5985 start_offset where to start in the subject string
5987 match_data points to a match_data block
5988 mcontext points to a PCRE2 context
5990 Returns: > 0 => success; value is the number of ovector pairs filled
5991 = 0 => success, but ovector is not big enough
5992 -1 => failed to match (PCRE2_ERROR_NOMATCH)
5993 -2 => partial match (PCRE2_ERROR_PARTIAL)
5994 < -2 => some kind of unexpected problem
5997 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
5998 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
5999 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6000 pcre2_match_context *mcontext)
6003 const uint8_t *start_bits = NULL;
6005 const pcre2_real_code *re = (const pcre2_real_code *)code;
6009 BOOL has_first_cu = FALSE;
6010 BOOL has_req_cu = FALSE;
6014 PCRE2_UCHAR first_cu = 0;
6015 PCRE2_UCHAR first_cu2 = 0;
6016 PCRE2_UCHAR req_cu = 0;
6017 PCRE2_UCHAR req_cu2 = 0;
6019 PCRE2_SPTR bumpalong_limit;
6020 PCRE2_SPTR end_subject;
6021 PCRE2_SPTR start_match = subject + start_offset;
6022 PCRE2_SPTR req_cu_ptr = start_match - 1;
6023 PCRE2_SPTR start_partial = NULL;
6024 PCRE2_SPTR match_partial = NULL;
6026 PCRE2_SIZE frame_size;
6028 /* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6029 macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6031 pcre2_callout_block cb;
6032 match_block actual_match_block;
6033 match_block *mb = &actual_match_block;
6035 /* Allocate an initial vector of backtracking frames on the stack. If this
6036 proves to be too small, it is replaced by a larger one on the heap. To get a
6037 vector of the size required that is aligned for pointers, allocate it as a
6038 vector of pointers. */
6040 PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)];
6041 mb->stack_frames = (heapframe *)stack_frames_vector;
6043 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
6046 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
6047 end_subject = subject + length;
6049 /* Plausibility checks */
6051 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6052 if (code == NULL || subject == NULL || match_data == NULL)
6053 return PCRE2_ERROR_NULL;
6054 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6056 /* Check that the first field in the block is the magic number. */
6058 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6060 /* Check the code unit width. */
6062 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6063 return PCRE2_ERROR_BADMODE;
6065 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6066 options variable for this function. Users of PCRE2 who are not calling the
6067 function directly would like to have a way of setting these flags, in the same
6068 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6069 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6070 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now
6071 transfer to the options for this function. The bits are guaranteed to be
6072 adjacent, but do not have the same values. This bit of Boolean trickery assumes
6073 that the match-time bits are not more significant than the flag bits. If by
6074 accident this is not the case, a compile-time division by zero error will
6077 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6078 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6079 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6083 /* These two settings are used in the code for checking a UTF string that
6084 follows immediately afterwards. Other values in the mb block are used only
6085 during interpretive processing, not when the JIT support is in use, so they are
6088 utf = (re->overall_options & PCRE2_UTF) != 0;
6089 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6090 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6092 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6095 if (mb->partial != 0 &&
6096 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6097 return PCRE2_ERROR_BADOPTION;
6099 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
6100 we must also check that a starting offset does not point into the middle of a
6101 multiunit character. We check only the portion of the subject that is going to
6102 be inspected during matching - from the offset minus the maximum lookbehind
6103 to the given length. This saves time when a small part of a large subject is
6104 being matched by the use of a starting offset. Note that the maximum lookbehind
6105 is a number of characters, not code units. */
6107 #ifdef SUPPORT_UNICODE
6108 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
6110 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
6112 if (start_offset > 0)
6114 #if PCRE2_CODE_UNIT_WIDTH != 32
6116 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6117 return PCRE2_ERROR_BADUTFOFFSET;
6118 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
6121 while (check_subject > subject &&
6122 #if PCRE2_CODE_UNIT_WIDTH == 8
6123 (*check_subject & 0xc0) == 0x80)
6125 (*check_subject & 0xfc00) == 0xdc00)
6126 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
6130 /* In the 32-bit library, one code unit equals one character. However,
6131 we cannot just subtract the lookbehind and then compare pointers, because
6132 a very large lookbehind could create an invalid pointer. */
6134 if (start_offset >= re->max_lookbehind)
6135 check_subject -= re->max_lookbehind;
6137 check_subject = subject;
6138 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6141 /* Validate the relevant portion of the subject. After an error, adjust the
6142 offset to be an absolute offset in the whole string. */
6144 match_data->rc = PRIV(valid_utf)(check_subject,
6145 length - (check_subject - subject), &(match_data->startchar));
6146 if (match_data->rc != 0)
6148 match_data->startchar += check_subject - subject;
6149 return match_data->rc;
6152 #endif /* SUPPORT_UNICODE */
6154 /* It is an error to set an offset limit without setting the flag at compile time. */
6157 if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6158 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6159 return PCRE2_ERROR_BADOFFSETLIMIT;
6161 /* If the pattern was successfully studied with JIT support, run the JIT
6162 executable instead of the rest of this function. Most options must be set at
6163 compile time for the JIT code to be usable. Fallback to the normal code path if
6164 an unsupported option is set or if JIT returns BADOPTION (which means that the
6165 selected normal or partial matching mode was not compiled). */
6168 if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0)
6170 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6171 match_data, mcontext);
6172 if (rc != PCRE2_ERROR_JIT_BADOPTION) return rc;
6176 /* Carry on with non-JIT matching. A NULL match context means "use a default
6177 context", but we take the memory control functions from the pattern. */
6179 if (mcontext == NULL)
6181 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6182 mb->memctl = re->memctl;
6184 else mb->memctl = mcontext->memctl;
6186 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6187 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
6188 startline = (re->flags & PCRE2_STARTLINE) != 0;
6189 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6190 end_subject : subject + mcontext->offset_limit;
6192 /* Initialize and set up the fixed fields in the callout block, with a pointer
6193 in the match block. */
6197 cb.subject = subject;
6198 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
6199 cb.callout_flags = 0;
6201 /* Fill in the remaining fields in the match block. */
6203 mb->callout = mcontext->callout;
6204 mb->callout_data = mcontext->callout_data;
6206 mb->start_subject = subject;
6207 mb->start_offset = start_offset;
6208 mb->end_subject = end_subject;
6209 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6211 mb->moptions = options; /* Match options */
6212 mb->poptions = re->overall_options; /* Pattern options */
6214 mb->ignore_skip_arg = 0;
6215 mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6218 /* The name table is needed for finding all the numbers associated with a
6219 given name, for condition testing. The code follows the name table. */
6221 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6222 mb->name_count = re->name_count;
6223 mb->name_entry_size = re->name_entry_size;
6224 mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6226 /* Process the \R and newline settings. */
6228 mb->bsr_convention = re->bsr_convention;
6229 mb->nltype = NLTYPE_FIXED;
6230 switch(re->newline_convention)
6232 case PCRE2_NEWLINE_CR:
6234 mb->nl[0] = CHAR_CR;
6237 case PCRE2_NEWLINE_LF:
6239 mb->nl[0] = CHAR_NL;
6242 case PCRE2_NEWLINE_NUL:
6244 mb->nl[0] = CHAR_NUL;
6247 case PCRE2_NEWLINE_CRLF:
6249 mb->nl[0] = CHAR_CR;
6250 mb->nl[1] = CHAR_NL;
6253 case PCRE2_NEWLINE_ANY:
6254 mb->nltype = NLTYPE_ANY;
6257 case PCRE2_NEWLINE_ANYCRLF:
6258 mb->nltype = NLTYPE_ANYCRLF;
6261 default: return PCRE2_ERROR_INTERNAL;
6264 /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
6265 vector at the end, whose size depends on the number of capturing parentheses in
6266 the pattern. It is not used at all if there are no capturing parentheses.
6268 frame_size is the total size of each frame
6269 mb->frame_vector_size is the total usable size of the vector (rounded down
6270 to a whole number of frames)
6272 The last of these is changed within the match() function if the frame vector
6273 has to be expanded. We therefore put it into the match block so that it is
6274 correct when calling match() more than once for non-anchored patterns. */
6276 frame_size = offsetof(heapframe, ovector) +
6277 re->top_bracket * 2 * sizeof(PCRE2_SIZE);
6279 /* Limits set in the pattern override the match context only if they are smaller. */
6282 mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
6283 mcontext->heap_limit : re->limit_heap;
6285 mb->match_limit = (mcontext->match_limit < re->limit_match)?
6286 mcontext->match_limit : re->limit_match;
6288 mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
6289 mcontext->depth_limit : re->limit_depth;
6291 /* If a pattern has very many capturing parentheses, the frame size may be very
6292 large. Ensure that there are at least 10 available frames by getting an initial
6293 vector on the heap if necessary, except when the heap limit prevents this. Get
6294 fewer if possible. (The heap limit is in kibibytes.) */
6296 if (frame_size <= START_FRAMES_SIZE/10)
6298 mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */
6299 mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
6303 mb->frame_vector_size = frame_size * 10;
6304 if ((mb->frame_vector_size / 1024) > mb->heap_limit)
6306 if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
6307 mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
6309 mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
6310 mb->memctl.memory_data);
6311 if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
6314 mb->match_frames_top =
6315 (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
6317 /* Write to the ovector within the first frame to mark every capture unset and
6318 to avoid uninitialized memory read errors when it is copied to a new frame. */
6320 memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
6321 re->top_bracket * 2 * sizeof(PCRE2_SIZE));
6323 /* Pointers to the individual character tables */
6325 mb->lcc = re->tables + lcc_offset;
6326 mb->fcc = re->tables + fcc_offset;
6327 mb->ctypes = re->tables + ctypes_offset;
6329 /* Set up the first code unit to match, if available. If there's no first code
6330 unit there may be a bitmap of possible first characters. */
6332 if ((re->flags & PCRE2_FIRSTSET) != 0)
6334 has_first_cu = TRUE;
6335 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
6336 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
6338 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
6339 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6340 if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
6345 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
6346 start_bits = re->start_bitmap;
6348 /* There may also be a "last known required character" set. */
6350 if ((re->flags & PCRE2_LASTSET) != 0)
6353 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
6354 if ((re->flags & PCRE2_LASTCASELESS) != 0)
6356 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
6357 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6358 if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
6364 /* ==========================================================================*/
6366 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6367 the loop runs just once. */
6371 PCRE2_SPTR new_start_match;
6373 /* ----------------- Start of match optimizations ---------------- */
6375 /* There are some optimizations that avoid running the match if a known
6376 starting point is not found, or if a known later code unit is not present.
6377 However, there is an option (settable at compile time) that disables these,
6378 for testing and for ensuring that all callouts do actually occur. */
6380 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
6382 /* If firstline is TRUE, the start of the match is constrained to the first
6383 line of a multiline string. That is, the match must be before or at the
6384 first newline following the start of matching. Temporarily adjust
6385 end_subject so that we stop the scans for a first code unit at a newline.
6386 If the match fails at the newline, later code breaks the loop. */
6390 PCRE2_SPTR t = start_match;
6391 #ifdef SUPPORT_UNICODE
6394 while (t < end_subject && !IS_NEWLINE(t))
6397 ACROSSCHAR(t < end_subject, t, t++);
6402 while (t < end_subject && !IS_NEWLINE(t)) t++;
6406 /* Anchored: check the first code unit if one is recorded. This may seem
6407 pointless but it can help in detecting a no match case without scanning for
6408 the required code unit. */
6412 if (has_first_cu || start_bits != NULL)
6414 BOOL ok = start_match < end_subject;
6417 PCRE2_UCHAR c = UCHAR21TEST(start_match);
6418 ok = has_first_cu && (c == first_cu || c == first_cu2);
6419 if (!ok && start_bits != NULL)
6421 #if PCRE2_CODE_UNIT_WIDTH != 8
6422 if (c > 255) c = 255;
6424 ok = (start_bits[c/8] & (1 << (c&7))) != 0;
6435 /* Not anchored. Advance to a unique first code unit if there is one. In
6436 8-bit mode, the use of memchr() gives a big speed up, even though we have
6437 to call it twice in caseless mode, in order to find the earliest occurrence
6438 of the character in either of its cases. */
6444 if (first_cu != first_cu2) /* Caseless */
6446 #if PCRE2_CODE_UNIT_WIDTH != 8
6448 while (start_match < end_subject &&
6449 (smc = UCHAR21TEST(start_match)) != first_cu &&
6452 #else /* 8-bit code units */
6454 memchr(start_match, first_cu, end_subject-start_match);
6456 memchr(start_match, first_cu2, end_subject-start_match);
6458 start_match = (pp2 == NULL)? end_subject : pp2;
6460 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
6464 /* The caseful case */
6468 #if PCRE2_CODE_UNIT_WIDTH != 8
6469 while (start_match < end_subject && UCHAR21TEST(start_match) !=
6473 start_match = memchr(start_match, first_cu, end_subject - start_match);
6474 if (start_match == NULL) start_match = end_subject;
6478 /* If we can't find the required code unit, having reached the true end
6479 of the subject, break the bumpalong loop, to force a match failure,
6480 except when doing partial matching, when we let the next cycle run at
6481 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
6482 which partially matches "abc", even though the string does not contain
6483 the starting character "d". If we have not reached the true end of the
6484 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
6485 we also let the cycle run, because the matching string is legitimately
6486 allowed to start with the first code unit of a newline. */
6488 if (!mb->partial && start_match >= mb->end_subject)
6495 /* If there's no first code unit, advance to just after a linebreak for a
6496 multiline match if required. */
6500 if (start_match > mb->start_subject + start_offset)
6502 #ifdef SUPPORT_UNICODE
6505 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6508 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
6513 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6516 /* If we have just passed a CR and the newline option is ANY or
6517 ANYCRLF, and we are now at a LF, advance the match position by one more code unit. */
6520 if (start_match[-1] == CHAR_CR &&
6521 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
6522 start_match < end_subject &&
6523 UCHAR21TEST(start_match) == CHAR_NL)
6528 /* If there's no first code unit or a requirement for a multiline line
6529 start, advance to a non-unique first code unit if any have been
6530 identified. The bitmap contains only 256 bits. When code units are 16 or
6531 32 bits wide, all code units greater than 254 set the 255 bit. */
6533 else if (start_bits != NULL)
6535 while (start_match < end_subject)
6537 uint32_t c = UCHAR21TEST(start_match);
6538 #if PCRE2_CODE_UNIT_WIDTH != 8
6539 if (c > 255) c = 255;
6541 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6545 /* See comment above in first_cu checking about the next few lines. */
6547 if (!mb->partial && start_match >= mb->end_subject)
6553 } /* End first code unit handling */
6555 /* Restore fudged end_subject */
6557 end_subject = mb->end_subject;
6559 /* The following two optimizations must be disabled for partial matching. */
6563 /* The minimum matching length is a lower bound; no string of that length
6564 may actually match the pattern. Although the value is, strictly, in
6565 characters, we treat it as code units to avoid spending too much time in
6566 this optimization. */
6568 if (end_subject - start_match < re->minlength)
6574 /* If req_cu is set, we know that that code unit must appear in the
6575 subject for the (non-partial) match to succeed. If the first code unit is
6576 set, req_cu must be later in the subject; otherwise the test starts at
6577 the match point. This optimization can save a huge amount of backtracking
6578 in patterns with nested unlimited repeats that aren't going to match.
6579 Writing separate code for caseful/caseless versions makes it go faster,
6580 as does using an autoincrement and backing off on a match. As in the case
6581 of the first code unit, using memchr() in the 8-bit library gives a big
6582 speed up. Unlike the first_cu check above, we do not need to call
6583 memchr() twice in the caseless case because we only need to check for the
6584 presence of the character in either case, not find the first occurrence.
6586 HOWEVER: when the subject string is very, very long, searching to its end
6587 can take a long time, and give bad performance on quite ordinary
6588 patterns. This showed up when somebody was matching something like
6589 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
6590 sufficiently long. */
6592 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
6594 PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
6596 /* We don't need to repeat the search if we haven't yet reached the
6597 place we found it last time round the bumpalong loop. */
6601 if (p < end_subject)
6603 if (req_cu != req_cu2) /* Caseless */
6605 #if PCRE2_CODE_UNIT_WIDTH != 8
6608 uint32_t pp = UCHAR21INCTEST(p);
6609 if (pp == req_cu || pp == req_cu2) { p--; break; }
6611 while (p < end_subject);
6613 #else /* 8-bit code units */
6615 p = memchr(pp, req_cu, end_subject - pp);
6618 p = memchr(pp, req_cu2, end_subject - pp);
6619 if (p == NULL) p = end_subject;
6621 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
6624 /* The caseful case */
6628 #if PCRE2_CODE_UNIT_WIDTH != 8
6631 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
6633 while (p < end_subject);
6635 #else /* 8-bit code units */
6636 p = memchr(p, req_cu, end_subject - p);
6637 if (p == NULL) p = end_subject;
6642 /* If we can't find the required code unit, break the bumpalong loop,
6643 forcing a match failure. */
6645 if (p >= end_subject)
6651 /* If we have found the required code unit, save the point where we
6652 found it, so that we don't search again next time round the bumpalong
6653 loop if the start hasn't yet passed this code unit. */
6661 /* ------------ End of start of match optimizations ------------ */
6663 /* Give no match if we have passed the bumpalong limit. */
6665 if (start_match > bumpalong_limit)
6671 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6672 first starting point for which a partial match was found. */
6674 cb.start_match = (PCRE2_SIZE)(start_match - subject);
6675 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
6677 mb->start_used_ptr = start_match;
6678 mb->last_used_ptr = start_match;
6679 mb->match_call_count = 0;
6680 mb->end_offset_top = 0;
6681 mb->skip_arg_count = 0;
6683 rc = match(start_match, mb->start_code, match_data->ovector,
6684 match_data->oveccount, re->top_bracket, frame_size, mb);
6686 if (mb->hitend && start_partial == NULL)
6688 start_partial = mb->start_used_ptr;
6689 match_partial = start_match;
6694 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6695 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6696 entirely. The only way we can do that is to re-do the match at the same
6697 point, with a flag to force SKIP with an argument to be ignored. Just
6698 treating this case as NOMATCH does not work because it does not check other
6699 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6701 case MATCH_SKIP_ARG:
6702 new_start_match = start_match;
6703 mb->ignore_skip_arg = mb->skip_arg_count;
6706 /* SKIP passes back the next starting point explicitly, but if it is no
6707 greater than the match we have just done, treat it as NOMATCH. */
6710 if (mb->verb_skip_ptr > start_match)
6712 new_start_match = mb->verb_skip_ptr;
6717 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6718 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6723 mb->ignore_skip_arg = 0;
6724 new_start_match = start_match + 1;
6725 #ifdef SUPPORT_UNICODE
6727 ACROSSCHAR(new_start_match < end_subject, new_start_match,
6732 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6738 /* Any other return is either a match, or some kind of error. */
6744 /* Control reaches here for the various types of "no match at this point"
6745 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6749 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
6750 newline in the subject (though it may continue over the newline). Therefore,
6751 if we have just failed to match, starting at a newline, do not continue. */
6753 if (firstline && IS_NEWLINE(start_match)) break;
6755 /* Advance to new matching position */
6757 start_match = new_start_match;
6759 /* Break the loop if the pattern is anchored or if we have passed the end of the subject. */
6762 if (anchored || start_match > end_subject) break;
6764 /* If we have just passed a CR and we are now at a LF, and the pattern does
6765 not contain any explicit matches for \r or \n, and the newline option is CRLF
6766 or ANY or ANYCRLF, advance the match position by one more code unit. In
6767 normal matching start_match will always be greater than the first position at
6768 this stage, but a failed *SKIP can cause a return at the same point, which is
6769 why the first test exists. */
6771 if (start_match > subject + start_offset &&
6772 start_match[-1] == CHAR_CR &&
6773 start_match < end_subject &&
6774 *start_match == CHAR_NL &&
6775 (re->flags & PCRE2_HASCRORLF) == 0 &&
6776 (mb->nltype == NLTYPE_ANY ||
6777 mb->nltype == NLTYPE_ANYCRLF ||
6781 mb->mark = NULL; /* Reset for start of next match attempt */
6782 } /* End of for(;;) "bumpalong" loop */
6784 /* ==========================================================================*/
6786 /* When we reach here, one of the following stopping conditions is true:
6788 (1) The match succeeded, either completely, or partially;
6790 (2) The pattern is anchored or the match was failed after (*COMMIT);
6792 (3) We are past the end of the subject or the bumpalong limit;
6794 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
6795 this option requests that a match occur at or before the first newline in the subject.
6798 (5) Some kind of error occurred.
6804 /* Release an enlarged frame vector that is on the heap. */
6806 if (mb->match_frames != mb->stack_frames)
6807 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
6809 /* Fill in fields that are always returned in the match data. */
6811 match_data->code = re;
6812 match_data->subject = subject;
6813 match_data->mark = mb->mark;
6814 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
6816 /* Handle a fully successful match. Set the return code to the number of
6817 captured strings, or 0 if there were too many to fit into the ovector, and then
6818 set the remaining returned values before returning. */
6820 if (rc == MATCH_MATCH)
6822 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
6823 0 : (int)mb->end_offset_top/2 + 1;
6824 match_data->startchar = start_match - subject;
6825 match_data->leftchar = mb->start_used_ptr - subject;
6826 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
6827 mb->last_used_ptr : mb->end_match_ptr) - subject;
6828 return match_data->rc;
6831 /* Control gets here if there has been a partial match, an error, or if the
6832 overall match attempt has failed at all permitted starting positions. Any mark
6833 data is in the nomatch_mark field. */
6835 match_data->mark = mb->nomatch_mark;
6837 /* For anything other than nomatch or partial match, just return the code. */
6839 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
6841 /* Handle a partial match. */
6843 else if (match_partial != NULL)
6845 match_data->ovector[0] = match_partial - subject;
6846 match_data->ovector[1] = end_subject - subject;
6847 match_data->startchar = match_partial - subject;
6848 match_data->leftchar = start_partial - subject;
6849 match_data->rightchar = end_subject - subject;
6850 match_data->rc = PCRE2_ERROR_PARTIAL;
6853 /* Else this is the classic nomatch case. */
6855 else match_data->rc = PCRE2_ERROR_NOMATCH;
6857 return match_data->rc;
6860 /* End of pcre2_match.c */