1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
/* Names defined ahead of the pcre2_internal.h include that follows.
   NOTE(review): presumably the newline-testing macros from that header (such
   as IS_NEWLINE and WAS_NEWLINE) expand in terms of these names -- confirm in
   pcre2_internal.h. Within this file NLBLOCK is also used directly to read
   the newline settings (NLBLOCK->nltype, NLBLOCK->nllen, NLBLOCK->nl[0]). */
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
83 #include "pcre2_internal.h"
/* Bitwise OR of the match-time option flags, collected under a single name.
   NOTE(review): presumably this is the set of option bits a caller may pass
   to pcre2_dfa_match(), used as a mask to reject anything else -- the use
   site is not visible here, so confirm there. */
85 #define PUBLIC_DFA_MATCH_OPTIONS \
86 (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
87 PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
88 PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
/* Each offset is added to codevalue in the matching "case" arm that follows
   the "codevalue >= OP_TYPESTAR" test, turning a type-repeat opcode whose
   argument is one of these items into a distinct, never-stored opcode value. */
100 #define OP_PROP_EXTRA 300 /* Added in the "case OP_PROP:" arm */
101 #define OP_EXTUNI_EXTRA 320 /* Added in the "case OP_EXTUNI:" arm */
102 #define OP_ANYNL_EXTRA 340 /* Added in the "case OP_ANYNL:" arm */
103 #define OP_HSPACE_EXTRA 360 /* Added in the "case OP_HSPACE:" arm */
104 #define OP_VSPACE_EXTRA 380 /* Added in the "case OP_VSPACE:" arm */
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
115 static const uint8_t coptable[] = {
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
157 0, /* XCLASS - variable length */
173 0, /* Assert behind */
174 0, /* Assert behind not */
176 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
177 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
178 0, 0, /* CREF, DNCREF */
179 0, 0, /* RREF, DNRREF */
180 0, 0, /* FALSE, TRUE */
181 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
182 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
183 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
184 0, 0, /* COMMIT, COMMIT_ARG */
185 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
186 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
189 /* This table identifies those opcodes that inspect a character. It is used to
190 remember the fact that a character could have been inspected when the end of
191 the subject is reached. ***NOTE*** If the start of this table is modified, the
192 two tables that follow must also be modified. */
194 static const uint8_t poptable[] = {
196 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
197 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
198 1, 1, 1, /* Any, AllAny, Anybyte */
200 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
202 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
207 /* Positive single-char repeats */
208 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
209 1, 1, 1, /* upto, minupto, exact */
210 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
211 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
212 1, 1, 1, /* upto I, minupto I, exact I */
213 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
214 /* Negative single-char repeats - only for chars < 256 */
215 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
216 1, 1, 1, /* NOT upto, minupto, exact */
217 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
218 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
219 1, 1, 1, /* NOT upto I, minupto I, exact I */
220 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
221 /* Positive type repeats */
222 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
223 1, 1, 1, /* Type upto, minupto, exact */
224 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
225 /* Character class & ref repeats */
226 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
227 1, 1, /* CRRANGE, CRMINRANGE */
228 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */
231 1, /* XCLASS - variable length */
247 0, /* Assert behind */
248 0, /* Assert behind not */
250 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
251 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
252 0, 0, /* CREF, DNCREF */
253 0, 0, /* RREF, DNRREF */
254 0, 0, /* FALSE, TRUE */
255 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
256 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
257 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
258 0, 0, /* COMMIT, COMMIT_ARG */
259 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
260 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
263 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
266 static const uint8_t toptable1[] = {
268 ctype_digit, ctype_digit,
269 ctype_space, ctype_space,
270 ctype_word, ctype_word,
271 0, 0 /* OP_ANY, OP_ALLANY */
274 static const uint8_t toptable2[] = {
279 1, 1 /* OP_ANY, OP_ALLANY */
283 /* Structure for holding data about a particular state, which is in effect the
284 current data for an active path through the match tree. It must consist
285 entirely of ints because the working vector we are passed, and which we put
286 these structures in, is a vector of ints. */
288 typedef struct stateblock {
289 int offset; /* Offset to opcode (-ve has meaning) */
290 int count; /* Count for repeats */
291 int data; /* Some use extra data */
/* Number of ints occupied by one stateblock. The matching function uses it to
   convert wscount, which arrives as a number of ints, into the number of state
   slots available in each of the two state lists. */
294 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
297 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
298 local working space and output vectors that were created on the stack. This has
299 caused issues for some patterns, especially in small-stack environments such as
300 Windows. A new scheme is now in use which sets up a vector on the stack, but if
301 this is too small, heap memory is used, up to the heap_limit. The main
302 parameters are all numbers of ints because the workspace is a vector of ints.
304 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
305 defined in pcre2_internal.h so as to be available to pcre2test when it is
306 finding the minimum heap requirement for a match. */
/* Workspace sizing constants, all expressed as numbers of ints. OVEC_UNIT is
   the number of ints occupied by one PCRE2_SIZE value, so the two RWS_OVEC_*
   sizes correspond to 1000 and 2 PCRE2_SIZE entries respectively.
   RWS_BASE_SIZE converts DFA_START_RWS_SIZE from bytes to ints. */
308 #define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int))
310 #define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */
311 #define RWS_RSIZE 1000 /* Work size for recursion */
312 #define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */
313 #define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */
315 /* This structure is at the start of each workspace block. */
317 typedef struct RWS_anchor {
318 struct RWS_anchor *next;
319 unsigned int size; /* Number of ints */
320 unsigned int free; /* Number of ints */
/* Number of ints occupied by the RWS_anchor header at the start of a
   workspace block. more_workspace() subtracts it from a new block's size to
   obtain the block's free space, and includes it in the minimum size check. */
323 #define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
327 /*************************************************
328 * Process a callout *
329 *************************************************/
331 /* This function is called to perform a callout.
334 code current code pointer
335 offsets points to current capture offsets
336 current_subject start of current subject match
337 ptr current position in subject
339 extracode extra code offset when called from condition
340 lengthptr where to return the callout length
342 Returns: the return from the callout
346 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
347 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
348 PCRE2_SIZE *lengthptr)
350 pcre2_callout_block *cb = mb->cb;
352 *lengthptr = (code[extracode] == OP_CALLOUT)?
353 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
354 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
356 if (mb->callout == NULL) return 0; /* No callout provided */
358 /* Fixed fields in the callout block are set once and for all at the start of
361 cb->offset_vector = offsets;
362 cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
363 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
364 cb->pattern_position = GET(code, 1 + extracode);
365 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
367 if (code[extracode] == OP_CALLOUT)
369 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
370 cb->callout_string_offset = 0;
371 cb->callout_string = NULL;
372 cb->callout_string_length = 0;
376 cb->callout_number = 0;
377 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
378 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
379 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
382 return (mb->callout)(cb, mb->callout_data);
387 /*************************************************
388 * Expand local workspace memory *
389 *************************************************/
391 /* This function is called when internal_dfa_match() is about to be called
392 recursively and there is insufficient working space left in the current
393 workspace block. If there's an existing next block, use it; otherwise get a new
394 block unless the heap limit is reached.
397 rwsptr pointer to block pointer (updated)
398 ovecsize space needed for an ovector
401 Returns: 0 rwsptr has been updated
406 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
408 RWS_anchor *rws = *rwsptr;
411 if (rws->next != NULL)
416 /* All sizes are in units of sizeof(int), except for mb->heap_limit, which is in
421 unsigned int newsize = rws->size * 2;
422 unsigned int heapleft = (unsigned int)
423 (((1024/sizeof(int))*mb->heap_limit - mb->heap_used));
424 if (newsize > heapleft) newsize = heapleft;
425 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
426 return PCRE2_ERROR_HEAPLIMIT;
427 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
428 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
429 mb->heap_used += newsize;
435 new->free = new->size - RWS_ANCHOR_SIZE;
442 /*************************************************
443 * Match a Regular Expression - DFA engine *
444 *************************************************/
446 /* This internal function applies a compiled pattern to a subject string,
447 starting at a given point, using a DFA engine. This function is called from the
448 external one, possibly multiple times if the pattern is not anchored. The
449 function calls itself recursively for some kinds of subpattern.
452 mb the match_data block with fixed information
453 this_start_code the opening bracket of this subexpression's code
454 current_subject where we currently are in the subject string
455 start_offset start offset in the subject string
456 offsets vector to contain the matching string offsets
457 offsetcount size of same
458 workspace vector of workspace
460 rlevel function call recursion level
462 Returns: > 0 => number of match offset pairs placed in offsets
463 = 0 => offsets overflowed; longest matches are present
464 -1 => failed to match
465 < -1 => some kind of unexpected problem
467 The following macros are used for adding states to the two state vectors (one
468 for the current character, one for the following character). */
470 #define ADD_ACTIVE(x,y) \
471 if (active_count++ < wscount) \
473 next_active_state->offset = (x); \
474 next_active_state->count = (y); \
475 next_active_state++; \
477 else return PCRE2_ERROR_DFA_WSSIZE
479 #define ADD_ACTIVE_DATA(x,y,z) \
480 if (active_count++ < wscount) \
482 next_active_state->offset = (x); \
483 next_active_state->count = (y); \
484 next_active_state->data = (z); \
485 next_active_state++; \
487 else return PCRE2_ERROR_DFA_WSSIZE
489 #define ADD_NEW(x,y) \
490 if (new_count++ < wscount) \
492 next_new_state->offset = (x); \
493 next_new_state->count = (y); \
496 else return PCRE2_ERROR_DFA_WSSIZE
498 #define ADD_NEW_DATA(x,y,z) \
499 if (new_count++ < wscount) \
501 next_new_state->offset = (x); \
502 next_new_state->count = (y); \
503 next_new_state->data = (z); \
506 else return PCRE2_ERROR_DFA_WSSIZE
508 /* And now, here is the code */
513 PCRE2_SPTR this_start_code,
514 PCRE2_SPTR current_subject,
515 PCRE2_SIZE start_offset,
517 uint32_t offsetcount,
523 stateblock *active_states, *new_states, *temp_states;
524 stateblock *next_active_state, *next_new_state;
525 const uint8_t *ctypes, *lcc, *fcc;
528 dfa_recursion_info new_recursive;
529 int active_count, new_count, match_count;
531 /* Some fields in the mb block are frequently referenced, so we load them into
532 independent variables in the hope that this will perform better. */
534 PCRE2_SPTR start_subject = mb->start_subject;
535 PCRE2_SPTR end_subject = mb->end_subject;
536 PCRE2_SPTR start_code = mb->start_code;
538 #ifdef SUPPORT_UNICODE
539 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
544 BOOL reset_could_continue = FALSE;
546 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
547 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
548 offsetcount &= (uint32_t)(-2); /* Round down */
551 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
552 (2 * INTS_PER_STATEBLOCK);
554 ctypes = mb->tables + ctypes_offset;
555 lcc = mb->tables + lcc_offset;
556 fcc = mb->tables + fcc_offset;
558 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
560 active_states = (stateblock *)(workspace + 2);
561 next_new_state = new_states = active_states + wscount;
564 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
565 the alternative states onto the list, and find out where the end is. This
566 makes it possible to use this function recursively, when we want to stop at a
567 matching internal ket rather than at the end.
569 If we are dealing with a backward assertion we have to find out the maximum
570 amount to move back, and set up each alternative appropriately. */
572 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
577 end_code = this_start_code;
580 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
581 if (back > max_back) max_back = back;
582 end_code += GET(end_code, 1);
584 while (*end_code == OP_ALT);
586 /* If we can't go back the amount required for the longest lookbehind
587 pattern, go back as far as we can; some alternatives may still be viable. */
589 #ifdef SUPPORT_UNICODE
590 /* In character mode we have to step back character by character */
594 for (gone_back = 0; gone_back < max_back; gone_back++)
596 if (current_subject <= start_subject) break;
598 ACROSSCHAR(current_subject > start_subject, current_subject,
605 /* In byte-mode we can do this quickly. */
608 size_t current_offset = (size_t)(current_subject - start_subject);
609 gone_back = (current_offset < max_back)? current_offset : max_back;
610 current_subject -= gone_back;
613 /* Save the earliest consulted character */
615 if (current_subject < mb->start_used_ptr)
616 mb->start_used_ptr = current_subject;
618 /* Now we can process the individual branches. There will be an OP_REVERSE at
619 the start of each branch, except when the length of the branch is zero. */
621 end_code = this_start_code;
624 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
625 size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
626 if (back <= gone_back)
628 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
629 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
631 end_code += GET(end_code, 1);
633 while (*end_code == OP_ALT);
636 /* This is the code for a "normal" subpattern (not a backward assertion). The
637 start of a whole pattern is always one of these. If we are at the top level,
638 we may be asked to restart matching from the same point that we reached for a
639 previous partial match. We still have to scan through the top-level branches to
640 find the end state. */
644 end_code = this_start_code;
648 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
650 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
651 new_count = workspace[1];
653 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
660 int length = 1 + LINK_SIZE +
661 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
662 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
666 ADD_NEW((int)(end_code - start_code + length), 0);
667 end_code += GET(end_code, 1);
668 length = 1 + LINK_SIZE;
670 while (*end_code == OP_ALT);
674 workspace[0] = 0; /* Bit indicating which vector is current */
676 /* Loop for scanning the subject */
678 ptr = current_subject;
685 BOOL partial_newline = FALSE;
686 BOOL could_continue = reset_could_continue;
687 reset_could_continue = FALSE;
689 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
691 /* Make the new state list into the active state list and empty the
694 temp_states = active_states;
695 active_states = new_states;
696 new_states = temp_states;
697 active_count = new_count;
700 workspace[0] ^= 1; /* Remember for the restarting feature */
701 workspace[1] = active_count;
703 /* Set the pointers for adding new states */
705 next_active_state = active_states + active_count;
706 next_new_state = new_states;
708 /* Load the current character from the subject outside the loop, as many
709 different states may want to look at it, and we assume that at least one
712 if (ptr < end_subject)
714 clen = 1; /* Number of data items in the character */
715 #ifdef SUPPORT_UNICODE
716 GETCHARLENTEST(c, ptr, clen);
719 #endif /* SUPPORT_UNICODE */
723 clen = 0; /* This indicates the end of the subject */
724 c = NOTACHAR; /* This value should never actually be used */
727 /* Scan up the active states and act on each one. The result of an action
728 may be to add more states to the currently active list (e.g. on hitting a
729 parenthesis) or it may be to put states on the new list, for considering
730 when we move the character pointer on. */
732 for (i = 0; i < active_count; i++)
734 stateblock *current_state = active_states + i;
735 BOOL caseless = FALSE;
738 int state_offset = current_state->offset;
742 /* A negative offset is a special case meaning "hold off going to this
743 (negated) state until the number of characters in the data field have
744 been skipped". If the could_continue flag was passed over from a previous
745 state, arrange for it to be passed on. */
747 if (state_offset < 0)
749 if (current_state->data > 0)
751 ADD_NEW_DATA(state_offset, current_state->count,
752 current_state->data - 1);
753 if (could_continue) reset_could_continue = TRUE;
758 current_state->offset = state_offset = -state_offset;
762 /* Check for a duplicate state with the same count, and skip if found.
763 See the note at the head of this module about the possibility of improving
766 for (j = 0; j < i; j++)
768 if (active_states[j].offset == state_offset &&
769 active_states[j].count == current_state->count)
770 goto NEXT_ACTIVE_STATE;
773 /* The state offset is the offset to the opcode */
775 code = start_code + state_offset;
778 /* If this opcode inspects a character, but we are at the end of the
779 subject, remember the fact for use when testing for a partial match. */
781 if (clen == 0 && poptable[codevalue] != 0)
782 could_continue = TRUE;
784 /* If this opcode is followed by an inline character, load it. It is
785 tempting to test for the presence of a subject character here, but that
786 is wrong, because sometimes zero repetitions of the subject are
789 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
790 argument that is not a data character - but is always one byte long because
791 the values are small. We have to take special action to deal with \P, \p,
792 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
793 these ones to new opcodes. */
795 if (coptable[codevalue] > 0)
798 #ifdef SUPPORT_UNICODE
799 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
800 #endif /* SUPPORT_UNICODE */
801 d = code[coptable[codevalue]];
802 if (codevalue >= OP_TYPESTAR)
806 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
808 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
809 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
810 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
812 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
814 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
821 dlen = 0; /* Not strictly necessary, but compilers moan */
822 d = NOTACHAR; /* if these variables are not set. */
826 /* Now process the individual opcodes */
830 /* ========================================================================== */
831 /* These cases are never obeyed. This is a fudge that causes a compile-
832 time error if the vectors coptable or poptable, which are indexed by
833 opcode, are not the correct length. It seems to be the only way to do
834 such a check at compile time, as the sizeof() operator does not work
835 in the C preprocessor. */
837 case OP_TABLE_LENGTH:
838 case OP_TABLE_LENGTH +
839 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
840 (sizeof(poptable) == OP_TABLE_LENGTH)):
843 /* ========================================================================== */
844 /* Reached a closing bracket. If not at the end of the pattern, carry
845 on with the next opcode. For repeating opcodes, also add the repeat
846 state. Note that KETRPOS will always be encountered at the end of the
847 subpattern, because the possessive subpattern repeats are always handled
848 using recursive calls. Thus, it never adds any new states.
850 At the end of the (sub)pattern, unless we have an empty string and
851 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
852 start of the subject, save the match data, shifting up all previous
853 matches so we always have the longest first. */
859 if (code != end_code)
861 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
862 if (codevalue != OP_KET)
864 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
869 if (ptr > current_subject ||
870 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
871 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
872 current_subject > start_subject + mb->start_offset)))
874 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
875 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
877 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
878 if (count > 0) (void)memmove(offsets + 2, offsets,
879 (size_t)count * sizeof(PCRE2_SIZE));
880 if (offsetcount >= 2)
882 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
883 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
885 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
890 /* ========================================================================== */
891 /* These opcodes add to the current list of states without looking
892 at the current character. */
894 /*-----------------------------------------------------------------*/
896 do { code += GET(code, 1); } while (*code == OP_ALT);
897 ADD_ACTIVE((int)(code - start_code), 0);
900 /*-----------------------------------------------------------------*/
905 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
906 code += GET(code, 1);
908 while (*code == OP_ALT);
911 /*-----------------------------------------------------------------*/
914 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
915 code += GET(code, 1);
916 while (*code == OP_ALT)
918 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
919 code += GET(code, 1);
923 /*-----------------------------------------------------------------*/
926 ADD_ACTIVE(state_offset + 1, 0);
927 code += 1 + GET(code, 2);
928 while (*code == OP_ALT) code += GET(code, 1);
929 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
932 /*-----------------------------------------------------------------*/
934 code += 1 + GET(code, 2);
935 while (*code == OP_ALT) code += GET(code, 1);
936 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
939 /*-----------------------------------------------------------------*/
941 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
942 { ADD_ACTIVE(state_offset + 1, 0); }
945 /*-----------------------------------------------------------------*/
947 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
948 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
949 && WAS_NEWLINE(ptr)))
950 { ADD_ACTIVE(state_offset + 1, 0); }
953 /*-----------------------------------------------------------------*/
955 if (ptr >= end_subject)
957 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
958 could_continue = TRUE;
959 else { ADD_ACTIVE(state_offset + 1, 0); }
963 /*-----------------------------------------------------------------*/
965 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
968 /*-----------------------------------------------------------------*/
970 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
974 /* ========================================================================== */
975 /* These opcodes inspect the next subject character, and sometimes
976 the previous one as well, but do not have an argument. The variable
977 clen contains the length of the current character and is zero if we are
978 at the end of the subject. */
980 /*-----------------------------------------------------------------*/
982 if (clen > 0 && !IS_NEWLINE(ptr))
984 if (ptr + 1 >= mb->end_subject &&
985 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
986 NLBLOCK->nltype == NLTYPE_FIXED &&
987 NLBLOCK->nllen == 2 &&
990 could_continue = partial_newline = TRUE;
994 ADD_NEW(state_offset + 1, 0);
999 /*-----------------------------------------------------------------*/
1002 { ADD_NEW(state_offset + 1, 0); }
1005 /*-----------------------------------------------------------------*/
1007 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1008 could_continue = TRUE;
1009 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1010 { ADD_ACTIVE(state_offset + 1, 0); }
1013 /*-----------------------------------------------------------------*/
1015 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1017 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1018 could_continue = TRUE;
1019 else if (clen == 0 ||
1020 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1021 (ptr == end_subject - mb->nllen)
1023 { ADD_ACTIVE(state_offset + 1, 0); }
1024 else if (ptr + 1 >= mb->end_subject &&
1025 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1026 NLBLOCK->nltype == NLTYPE_FIXED &&
1027 NLBLOCK->nllen == 2 &&
1028 c == NLBLOCK->nl[0])
1030 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1032 reset_could_continue = TRUE;
1033 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1035 else could_continue = partial_newline = TRUE;
1040 /*-----------------------------------------------------------------*/
1042 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1044 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045 could_continue = TRUE;
1046 else if (clen == 0 ||
1047 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1048 { ADD_ACTIVE(state_offset + 1, 0); }
1049 else if (ptr + 1 >= mb->end_subject &&
1050 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1051 NLBLOCK->nltype == NLTYPE_FIXED &&
1052 NLBLOCK->nllen == 2 &&
1053 c == NLBLOCK->nl[0])
1055 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1057 reset_could_continue = TRUE;
1058 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1060 else could_continue = partial_newline = TRUE;
1063 else if (IS_NEWLINE(ptr))
1064 { ADD_ACTIVE(state_offset + 1, 0); }
1067 /*-----------------------------------------------------------------*/
1072 if (clen > 0 && c < 256 &&
1073 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1074 { ADD_NEW(state_offset + 1, 0); }
1077 /*-----------------------------------------------------------------*/
1079 case OP_NOT_WHITESPACE:
1080 case OP_NOT_WORDCHAR:
1081 if (clen > 0 && (c >= 256 ||
1082 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1083 { ADD_NEW(state_offset + 1, 0); }
1086 /*-----------------------------------------------------------------*/
1087 case OP_WORD_BOUNDARY:
1088 case OP_NOT_WORD_BOUNDARY:
1090 int left_word, right_word;
1092 if (ptr > start_subject)
1094 PCRE2_SPTR temp = ptr - 1;
1095 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1096 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1097 if (utf) { BACKCHAR(temp); }
1099 GETCHARTEST(d, temp);
1100 #ifdef SUPPORT_UNICODE
1101 if ((mb->poptions & PCRE2_UCP) != 0)
1103 if (d == '_') left_word = TRUE; else
1105 uint32_t cat = UCD_CATEGORY(d);
1106 left_word = (cat == ucp_L || cat == ucp_N);
1111 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1113 else left_word = FALSE;
1117 if (ptr >= mb->last_used_ptr)
1119 PCRE2_SPTR temp = ptr + 1;
1120 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1121 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1123 mb->last_used_ptr = temp;
1125 #ifdef SUPPORT_UNICODE
1126 if ((mb->poptions & PCRE2_UCP) != 0)
1128 if (c == '_') right_word = TRUE; else
1130 uint32_t cat = UCD_CATEGORY(c);
1131 right_word = (cat == ucp_L || cat == ucp_N);
1136 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1138 else right_word = FALSE;
1140 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1141 { ADD_ACTIVE(state_offset + 1, 0); }
1146 /*-----------------------------------------------------------------*/
1147 /* Check the next character by Unicode property. We will get here only
1148 if the support is in the binary; otherwise a compile-time error occurs.
1151 #ifdef SUPPORT_UNICODE
1158 const ucd_record * prop = GET_UCD(c);
1166 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1167 prop->chartype == ucp_Lt;
1171 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1175 OK = prop->chartype == code[2];
1179 OK = prop->script == code[2];
1182 /* These are specials for combination cases. */
1185 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1186 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1189 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1190 which means that Perl space and POSIX space are now identical. PCRE
1191 was changed at release 8.34. */
1193 case PT_SPACE: /* Perl space */
1194 case PT_PXSPACE: /* POSIX space */
1203 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1209 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1210 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1211 c == CHAR_UNDERSCORE;
1215 cp = PRIV(ucd_caseless_sets) + code[2];
1218 if (c < *cp) { OK = FALSE; break; }
1219 if (c == *cp++) { OK = TRUE; break; }
1224 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1225 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1229 /* Should never occur, but keep compilers from grumbling. */
1232 OK = codevalue != OP_PROP;
1236 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1243 /* ========================================================================== */
1244 /* These opcodes likewise inspect the subject character, but have an
1245 argument that is not a data character. It is one of these opcodes:
1246 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1247 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1250 case OP_TYPEMINPLUS:
1251 case OP_TYPEPOSPLUS:
1252 count = current_state->count; /* Already matched */
1253 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1256 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1257 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1258 NLBLOCK->nltype == NLTYPE_FIXED &&
1259 NLBLOCK->nllen == 2 &&
1260 c == NLBLOCK->nl[0])
1262 could_continue = partial_newline = TRUE;
1264 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1266 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1269 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1271 active_count--; /* Remove non-match possibility */
1272 next_active_state--;
1275 ADD_NEW(state_offset, count);
1280 /*-----------------------------------------------------------------*/
1282 case OP_TYPEMINQUERY:
1283 case OP_TYPEPOSQUERY:
1284 ADD_ACTIVE(state_offset + 2, 0);
1287 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1288 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1289 NLBLOCK->nltype == NLTYPE_FIXED &&
1290 NLBLOCK->nllen == 2 &&
1291 c == NLBLOCK->nl[0])
1293 could_continue = partial_newline = TRUE;
1295 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1297 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1298 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1300 if (codevalue == OP_TYPEPOSQUERY)
1302 active_count--; /* Remove non-match possibility */
1303 next_active_state--;
1305 ADD_NEW(state_offset + 2, 0);
1310 /*-----------------------------------------------------------------*/
1312 case OP_TYPEMINSTAR:
1313 case OP_TYPEPOSSTAR:
1314 ADD_ACTIVE(state_offset + 2, 0);
1317 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1318 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1319 NLBLOCK->nltype == NLTYPE_FIXED &&
1320 NLBLOCK->nllen == 2 &&
1321 c == NLBLOCK->nl[0])
1323 could_continue = partial_newline = TRUE;
1325 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1327 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1328 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1330 if (codevalue == OP_TYPEPOSSTAR)
1332 active_count--; /* Remove non-match possibility */
1333 next_active_state--;
1335 ADD_NEW(state_offset, 0);
1340 /*-----------------------------------------------------------------*/
1342 count = current_state->count; /* Number already matched */
1345 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1346 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1347 NLBLOCK->nltype == NLTYPE_FIXED &&
1348 NLBLOCK->nllen == 2 &&
1349 c == NLBLOCK->nl[0])
1351 could_continue = partial_newline = TRUE;
1353 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1355 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1356 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1358 if (++count >= (int)GET2(code, 1))
1359 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1361 { ADD_NEW(state_offset, count); }
1366 /*-----------------------------------------------------------------*/
1368 case OP_TYPEMINUPTO:
1369 case OP_TYPEPOSUPTO:
1370 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1371 count = current_state->count; /* Number already matched */
1374 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1375 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1376 NLBLOCK->nltype == NLTYPE_FIXED &&
1377 NLBLOCK->nllen == 2 &&
1378 c == NLBLOCK->nl[0])
1380 could_continue = partial_newline = TRUE;
1382 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1384 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1385 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1387 if (codevalue == OP_TYPEPOSUPTO)
1389 active_count--; /* Remove non-match possibility */
1390 next_active_state--;
1392 if (++count >= (int)GET2(code, 1))
1393 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1395 { ADD_NEW(state_offset, count); }
1400 /* ========================================================================== */
1401 /* These are virtual opcodes that are used when something like
1402 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1403 argument. It keeps the code above fast for the other cases. The argument
1404 is in the d variable. */
1406 #ifdef SUPPORT_UNICODE
1407 case OP_PROP_EXTRA + OP_TYPEPLUS:
1408 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1409 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1410 count = current_state->count; /* Already matched */
1411 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1416 const ucd_record * prop = GET_UCD(c);
1424 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1425 prop->chartype == ucp_Lt;
1429 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1433 OK = prop->chartype == code[3];
1437 OK = prop->script == code[3];
1440 /* These are specials for combination cases. */
1443 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1444 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1447 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1448 which means that Perl space and POSIX space are now identical. PCRE
1449 was changed at release 8.34. */
1451 case PT_SPACE: /* Perl space */
1452 case PT_PXSPACE: /* POSIX space */
1461 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1467 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1468 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1469 c == CHAR_UNDERSCORE;
1473 cp = PRIV(ucd_caseless_sets) + code[3];
1476 if (c < *cp) { OK = FALSE; break; }
1477 if (c == *cp++) { OK = TRUE; break; }
1482 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1483 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1487 /* Should never occur, but keep compilers from grumbling. */
1490 OK = codevalue != OP_PROP;
1494 if (OK == (d == OP_PROP))
1496 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1498 active_count--; /* Remove non-match possibility */
1499 next_active_state--;
1502 ADD_NEW(state_offset, count);
1507 /*-----------------------------------------------------------------*/
1508 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1509 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1510 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1511 count = current_state->count; /* Already matched */
1512 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1516 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1518 active_count--; /* Remove non-match possibility */
1519 next_active_state--;
1521 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1524 ADD_NEW_DATA(-state_offset, count, ncount);
1529 /*-----------------------------------------------------------------*/
1530 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1531 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1532 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1533 count = current_state->count; /* Already matched */
1534 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1546 #endif /* Not EBCDIC */
1547 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1551 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1556 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1558 active_count--; /* Remove non-match possibility */
1559 next_active_state--;
1562 ADD_NEW_DATA(-state_offset, count, ncount);
1571 /*-----------------------------------------------------------------*/
1572 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1573 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1574 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1575 count = current_state->count; /* Already matched */
1576 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1591 if (OK == (d == OP_VSPACE))
1593 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1595 active_count--; /* Remove non-match possibility */
1596 next_active_state--;
1599 ADD_NEW_DATA(-state_offset, count, 0);
1604 /*-----------------------------------------------------------------*/
1605 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1606 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1607 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1608 count = current_state->count; /* Already matched */
1609 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1624 if (OK == (d == OP_HSPACE))
1626 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1628 active_count--; /* Remove non-match possibility */
1629 next_active_state--;
1632 ADD_NEW_DATA(-state_offset, count, 0);
1637 /*-----------------------------------------------------------------*/
1638 #ifdef SUPPORT_UNICODE
1639 case OP_PROP_EXTRA + OP_TYPEQUERY:
1640 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1641 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1645 case OP_PROP_EXTRA + OP_TYPESTAR:
1646 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1647 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1652 ADD_ACTIVE(state_offset + 4, 0);
1657 const ucd_record * prop = GET_UCD(c);
1665 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1666 prop->chartype == ucp_Lt;
1670 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1674 OK = prop->chartype == code[3];
1678 OK = prop->script == code[3];
1681 /* These are specials for combination cases. */
1684 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1685 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1688 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1689 which means that Perl space and POSIX space are now identical. PCRE
1690 was changed at release 8.34. */
1692 case PT_SPACE: /* Perl space */
1693 case PT_PXSPACE: /* POSIX space */
1702 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1708 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1709 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1710 c == CHAR_UNDERSCORE;
1714 cp = PRIV(ucd_caseless_sets) + code[3];
1717 if (c < *cp) { OK = FALSE; break; }
1718 if (c == *cp++) { OK = TRUE; break; }
1723 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1724 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1728 /* Should never occur, but keep compilers from grumbling. */
1731 OK = codevalue != OP_PROP;
1735 if (OK == (d == OP_PROP))
1737 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1738 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1740 active_count--; /* Remove non-match possibility */
1741 next_active_state--;
1743 ADD_NEW(state_offset + count, 0);
1748 /*-----------------------------------------------------------------*/
1749 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1750 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1751 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1755 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1756 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1757 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1762 ADD_ACTIVE(state_offset + 2, 0);
1766 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1767 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1769 active_count--; /* Remove non-match possibility */
1770 next_active_state--;
1772 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1774 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1779 /*-----------------------------------------------------------------*/
1780 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1781 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1782 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1786 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1787 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1788 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1792 ADD_ACTIVE(state_offset + 2, 0);
1804 #endif /* Not EBCDIC */
1805 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1809 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1814 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1815 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1817 active_count--; /* Remove non-match possibility */
1818 next_active_state--;
1820 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1829 /*-----------------------------------------------------------------*/
1830 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1831 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1832 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1836 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1837 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1838 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1842 ADD_ACTIVE(state_offset + 2, 0);
1856 if (OK == (d == OP_VSPACE))
1858 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1859 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1861 active_count--; /* Remove non-match possibility */
1862 next_active_state--;
1864 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1869 /*-----------------------------------------------------------------*/
1870 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1871 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1872 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1876 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1877 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1878 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1882 ADD_ACTIVE(state_offset + 2, 0);
1897 if (OK == (d == OP_HSPACE))
1899 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1900 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1902 active_count--; /* Remove non-match possibility */
1903 next_active_state--;
1905 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1910 /*-----------------------------------------------------------------*/
1911 #ifdef SUPPORT_UNICODE
1912 case OP_PROP_EXTRA + OP_TYPEEXACT:
1913 case OP_PROP_EXTRA + OP_TYPEUPTO:
1914 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1915 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1916 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1917 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1918 count = current_state->count; /* Number already matched */
1923 const ucd_record * prop = GET_UCD(c);
1924 switch(code[1 + IMM2_SIZE + 1])
1931 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1932 prop->chartype == ucp_Lt;
1936 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1940 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1944 OK = prop->script == code[1 + IMM2_SIZE + 2];
1947 /* These are specials for combination cases. */
1950 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1951 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1954 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1955 which means that Perl space and POSIX space are now identical. PCRE
1956 was changed at release 8.34. */
1958 case PT_SPACE: /* Perl space */
1959 case PT_PXSPACE: /* POSIX space */
1968 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1974 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1975 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1976 c == CHAR_UNDERSCORE;
1980 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1983 if (c < *cp) { OK = FALSE; break; }
1984 if (c == *cp++) { OK = TRUE; break; }
1989 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1990 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1994 /* Should never occur, but keep compilers from grumbling. */
1997 OK = codevalue != OP_PROP;
2001 if (OK == (d == OP_PROP))
2003 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2005 active_count--; /* Remove non-match possibility */
2006 next_active_state--;
2008 if (++count >= (int)GET2(code, 1))
2009 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2011 { ADD_NEW(state_offset, count); }
2016 /*-----------------------------------------------------------------*/
2017 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2018 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2019 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2020 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2021 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2022 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2023 count = current_state->count; /* Number already matched */
2028 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2030 active_count--; /* Remove non-match possibility */
2031 next_active_state--;
2033 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2035 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2036 reset_could_continue = TRUE;
2037 if (++count >= (int)GET2(code, 1))
2038 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2040 { ADD_NEW_DATA(-state_offset, count, ncount); }
2045 /*-----------------------------------------------------------------*/
2046 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2047 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2048 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2049 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2050 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2051 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2052 count = current_state->count; /* Number already matched */
2064 #endif /* Not EBCDIC */
2065 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2069 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2074 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2076 active_count--; /* Remove non-match possibility */
2077 next_active_state--;
2079 if (++count >= (int)GET2(code, 1))
2080 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2082 { ADD_NEW_DATA(-state_offset, count, ncount); }
2091 /*-----------------------------------------------------------------*/
2092 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2093 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2094 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2095 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2096 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2097 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2098 count = current_state->count; /* Number already matched */
2112 if (OK == (d == OP_VSPACE))
2114 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2116 active_count--; /* Remove non-match possibility */
2117 next_active_state--;
2119 if (++count >= (int)GET2(code, 1))
2120 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2122 { ADD_NEW_DATA(-state_offset, count, 0); }
2127 /*-----------------------------------------------------------------*/
2128 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2129 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2130 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2131 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2132 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2133 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2134 count = current_state->count; /* Number already matched */
2149 if (OK == (d == OP_HSPACE))
2151 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2153 active_count--; /* Remove non-match possibility */
2154 next_active_state--;
2156 if (++count >= (int)GET2(code, 1))
2157 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2159 { ADD_NEW_DATA(-state_offset, count, 0); }
2164 /* ========================================================================== */
2165 /* These opcodes are followed by a character that is usually compared
2166 to the current subject character; it is loaded into d. We still get
2167 here even if there is no subject character, because in some cases zero
2168 repetitions are permitted. */
2170 /*-----------------------------------------------------------------*/
2172 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2175 /*-----------------------------------------------------------------*/
2177 if (clen == 0) break;
2179 #ifdef SUPPORT_UNICODE
2182 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2184 unsigned int othercase;
2188 othercase = UCD_OTHERCASE(c);
2189 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2193 #endif /* SUPPORT_UNICODE */
2196 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2197 { ADD_NEW(state_offset + 2, 0); }
2202 #ifdef SUPPORT_UNICODE
2203 /*-----------------------------------------------------------------*/
2204 /* This is a tricky one because it can match more than one character.
2205 Find out how many characters to skip, and then set up a negative state
2206 to wait for them to pass before continuing. */
2212 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2213 end_subject, utf, &ncount);
2214 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2215 reset_could_continue = TRUE;
2216 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2221 /*-----------------------------------------------------------------*/
2222 /* This is tricky, like EXTUNI, because it too can match more than one
2223 character (when CR is followed by LF). In this case, set up a negative
2224 state to wait for one character to pass before continuing. */
2227 if (clen > 0) switch(c)
2235 #endif /* Not EBCDIC */
2236 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2240 ADD_NEW(state_offset + 1, 0);
2244 if (ptr + 1 >= end_subject)
2246 ADD_NEW(state_offset + 1, 0);
2247 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2248 reset_could_continue = TRUE;
2250 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2252 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2256 ADD_NEW(state_offset + 1, 0);
2262 /*-----------------------------------------------------------------*/
2264 if (clen > 0) switch(c)
2270 ADD_NEW(state_offset + 1, 0);
2275 /*-----------------------------------------------------------------*/
2277 if (clen > 0) switch(c)
2280 ADD_NEW(state_offset + 1, 0);
2288 /*-----------------------------------------------------------------*/
2290 if (clen > 0) switch(c)
2296 ADD_NEW(state_offset + 1, 0);
2301 /*-----------------------------------------------------------------*/
2303 if (clen > 0) switch(c)
2306 ADD_NEW(state_offset + 1, 0);
2314 /*-----------------------------------------------------------------*/
2315 /* Match a negated single character casefully. */
2318 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2321 /*-----------------------------------------------------------------*/
2322 /* Match a negated single character caselessly. */
2328 #ifdef SUPPORT_UNICODE
2329 if (utf && d >= 128)
2330 otherd = UCD_OTHERCASE(d);
2332 #endif /* SUPPORT_UNICODE */
2333 otherd = TABLE_GET(d, fcc, d);
2334 if (c != d && c != otherd)
2335 { ADD_NEW(state_offset + dlen + 1, 0); }
2339 /*-----------------------------------------------------------------*/
2344 case OP_NOTMINPLUSI:
2345 case OP_NOTPOSPLUSI:
2347 codevalue -= OP_STARI - OP_STAR;
2356 count = current_state->count; /* Already matched */
2357 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2360 uint32_t otherd = NOTACHAR;
2363 #ifdef SUPPORT_UNICODE
2364 if (utf && d >= 128)
2365 otherd = UCD_OTHERCASE(d);
2367 #endif /* SUPPORT_UNICODE */
2368 otherd = TABLE_GET(d, fcc, d);
2370 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2373 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2375 active_count--; /* Remove non-match possibility */
2376 next_active_state--;
2379 ADD_NEW(state_offset, count);
2384 /*-----------------------------------------------------------------*/
2389 case OP_NOTMINQUERYI:
2390 case OP_NOTPOSQUERYI:
2392 codevalue -= OP_STARI - OP_STAR;
2398 case OP_NOTMINQUERY:
2399 case OP_NOTPOSQUERY:
2400 ADD_ACTIVE(state_offset + dlen + 1, 0);
2403 uint32_t otherd = NOTACHAR;
2406 #ifdef SUPPORT_UNICODE
2407 if (utf && d >= 128)
2408 otherd = UCD_OTHERCASE(d);
2410 #endif /* SUPPORT_UNICODE */
2411 otherd = TABLE_GET(d, fcc, d);
2413 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2415 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2417 active_count--; /* Remove non-match possibility */
2418 next_active_state--;
2420 ADD_NEW(state_offset + dlen + 1, 0);
2425 /*-----------------------------------------------------------------*/
2430 case OP_NOTMINSTARI:
2431 case OP_NOTPOSSTARI:
2433 codevalue -= OP_STARI - OP_STAR;
2441 ADD_ACTIVE(state_offset + dlen + 1, 0);
2444 uint32_t otherd = NOTACHAR;
2447 #ifdef SUPPORT_UNICODE
2448 if (utf && d >= 128)
2449 otherd = UCD_OTHERCASE(d);
2451 #endif /* SUPPORT_UNICODE */
2452 otherd = TABLE_GET(d, fcc, d);
2454 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2456 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2458 active_count--; /* Remove non-match possibility */
2459 next_active_state--;
2461 ADD_NEW(state_offset, 0);
2466 /*-----------------------------------------------------------------*/
2470 codevalue -= OP_STARI - OP_STAR;
2474 count = current_state->count; /* Number already matched */
2477 uint32_t otherd = NOTACHAR;
2480 #ifdef SUPPORT_UNICODE
2481 if (utf && d >= 128)
2482 otherd = UCD_OTHERCASE(d);
2484 #endif /* SUPPORT_UNICODE */
2485 otherd = TABLE_GET(d, fcc, d);
2487 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2489 if (++count >= (int)GET2(code, 1))
2490 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2492 { ADD_NEW(state_offset, count); }
2497 /*-----------------------------------------------------------------*/
2502 case OP_NOTMINUPTOI:
2503 case OP_NOTPOSUPTOI:
2505 codevalue -= OP_STARI - OP_STAR;
2513 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2514 count = current_state->count; /* Number already matched */
2517 uint32_t otherd = NOTACHAR;
2520 #ifdef SUPPORT_UNICODE
2521 if (utf && d >= 128)
2522 otherd = UCD_OTHERCASE(d);
2524 #endif /* SUPPORT_UNICODE */
2525 otherd = TABLE_GET(d, fcc, d);
2527 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2529 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2531 active_count--; /* Remove non-match possibility */
2532 next_active_state--;
2534 if (++count >= (int)GET2(code, 1))
2535 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2537 { ADD_NEW(state_offset, count); }
2543 /* ========================================================================== */
2544 /* These are the class-handling opcodes */
2550 BOOL isinclass = FALSE;
2551 int next_state_offset;
2554 /* For a simple class, there is always just a 32-byte table, and we
2555 can set isinclass from it. */
2557 if (codevalue != OP_XCLASS)
2559 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2562 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2563 ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2567 /* An extended class may have a table or a list of single characters,
2568 ranges, or both, and it may be positive or negative. There's a
2569 function that sorts all this out. */
2573 ecode = code + GET(code, 1);
2574 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2577 /* At this point, isinclass is set for all kinds of class, and ecode
2578 points to the byte after the end of the class. If there is a
2579 quantifier, this is where it will be. */
2581 next_state_offset = (int)(ecode - start_code);
2588 ADD_ACTIVE(next_state_offset + 1, 0);
2591 if (*ecode == OP_CRPOSSTAR)
2593 active_count--; /* Remove non-match possibility */
2594 next_active_state--;
2596 ADD_NEW(state_offset, 0);
2603 count = current_state->count; /* Already matched */
2604 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2607 if (count > 0 && *ecode == OP_CRPOSPLUS)
2609 active_count--; /* Remove non-match possibility */
2610 next_active_state--;
2613 ADD_NEW(state_offset, count);
2620 ADD_ACTIVE(next_state_offset + 1, 0);
2623 if (*ecode == OP_CRPOSQUERY)
2625 active_count--; /* Remove non-match possibility */
2626 next_active_state--;
2628 ADD_NEW(next_state_offset + 1, 0);
2635 count = current_state->count; /* Already matched */
2636 if (count >= (int)GET2(ecode, 1))
2637 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2640 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2642 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2644 active_count--; /* Remove non-match possibility */
2645 next_active_state--;
2648 if (++count >= max && max != 0) /* Max 0 => no limit */
2649 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2651 { ADD_NEW(state_offset, count); }
2656 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2662 /* ========================================================================== */
2663 /* These are the opcodes for fancy brackets of various kinds. We have
2664 to use recursion in order to handle them. The "always failing" assertion
2665 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2666 though the other "backtracking verbs" are not supported. */
2669 forced_fail++; /* Count FAILs for multiple states */
2675 case OP_ASSERTBACK_NOT:
2678 int *local_workspace;
2679 PCRE2_SIZE *local_offsets;
2680 PCRE2_SPTR endasscode = code + GET(code, 1);
2681 RWS_anchor *rws = (RWS_anchor *)RWS;
2683 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2685 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2686 if (rc != 0) return rc;
2690 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2691 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2692 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2694 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2696 rc = internal_dfa_match(
2697 mb, /* static match data */
2698 code, /* this subexpression's code */
2699 ptr, /* where we currently are */
2700 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2701 local_offsets, /* offset vector */
2702 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2703 local_workspace, /* workspace vector */
2704 RWS_RSIZE, /* size of same */
2705 rlevel, /* function recursion level */
2706 RWS); /* recursion workspace */
2708 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2710 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2711 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2712 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2716 /*-----------------------------------------------------------------*/
2720 int codelink = (int)GET(code, 1);
2721 PCRE2_UCHAR condcode;
2723 /* Because of the way auto-callout works during compile, a callout item
2724 is inserted between OP_COND and an assertion condition. This does not
2725 happen for the other conditions. */
2727 if (code[LINK_SIZE + 1] == OP_CALLOUT
2728 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2730 PCRE2_SIZE callout_length;
2731 rrc = do_callout(code, offsets, current_subject, ptr, mb,
2732 1 + LINK_SIZE, &callout_length);
2733 if (rrc < 0) return rrc; /* Abandon */
2734 if (rrc > 0) break; /* Fail this thread */
2735 code += callout_length; /* Skip callout data */
2738 condcode = code[LINK_SIZE+1];
2740 /* Back reference conditions and duplicate named recursion conditions
2741 are not supported */
2743 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2744 condcode == OP_DNRREF)
2745 return PCRE2_ERROR_DFA_UCOND;
2747 /* The DEFINE condition is always false, and the assertion (?!) is
2748 converted to OP_FAIL. */
2750 if (condcode == OP_FALSE || condcode == OP_FAIL)
2751 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2753 /* There is also an always-true condition */
2755 else if (condcode == OP_TRUE)
2756 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2758 /* The only supported version of OP_RREF is for the value RREF_ANY,
2759 which means "test if in any recursion". We can't test for specifically
2762 else if (condcode == OP_RREF)
2764 unsigned int value = GET2(code, LINK_SIZE + 2);
2765 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2766 if (mb->recursive != NULL)
2767 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2768 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2771 /* Otherwise, the condition is an assertion */
2776 int *local_workspace;
2777 PCRE2_SIZE *local_offsets;
2778 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2779 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2780 RWS_anchor *rws = (RWS_anchor *)RWS;
2782 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2784 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2785 if (rc != 0) return rc;
2789 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2790 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2791 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2793 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2795 rc = internal_dfa_match(
2796 mb, /* fixed match data */
2797 asscode, /* this subexpression's code */
2798 ptr, /* where we currently are */
2799 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2800 local_offsets, /* offset vector */
2801 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2802 local_workspace, /* workspace vector */
2803 RWS_RSIZE, /* size of same */
2804 rlevel, /* function recursion level */
2805 RWS); /* recursion workspace */
2807 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2809 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2811 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2812 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2814 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2819 /*-----------------------------------------------------------------*/
2823 int *local_workspace;
2824 PCRE2_SIZE *local_offsets;
2825 RWS_anchor *rws = (RWS_anchor *)RWS;
2826 dfa_recursion_info *ri;
2827 PCRE2_SPTR callpat = start_code + GET(code, 1);
2828 uint32_t recno = (callpat == mb->start_code)? 0 :
2829 GET2(callpat, 1 + LINK_SIZE);
2831 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2833 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2834 if (rc != 0) return rc;
2838 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2839 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2840 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2842 /* Check for repeating a recursion without advancing the subject
2843 pointer. This should catch convoluted mutual recursions. (Some simple
2844 cases are caught at compile time.) */
2846 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2847 if (recno == ri->group_num && ptr == ri->subject_position)
2848 return PCRE2_ERROR_RECURSELOOP;
2850 /* Remember this recursion and where we started it so as to
2851 catch infinite loops. */
2853 new_recursive.group_num = recno;
2854 new_recursive.subject_position = ptr;
2855 new_recursive.prevrec = mb->recursive;
2856 mb->recursive = &new_recursive;
2858 rc = internal_dfa_match(
2859 mb, /* fixed match data */
2860 callpat, /* this subexpression's code */
2861 ptr, /* where we currently are */
2862 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2863 local_offsets, /* offset vector */
2864 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2865 local_workspace, /* workspace vector */
2866 RWS_RSIZE, /* size of same */
2867 rlevel, /* function recursion level */
2868 RWS); /* recursion workspace */
2870 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2871 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2873 /* Ran out of internal offsets */
2875 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2877 /* For each successful matched substring, set up the next state with a
2878 count of characters to skip before trying it. Note that the count is in
2879 characters, not bytes. */
2883 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2885 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2886 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2889 PCRE2_SPTR p = start_subject + local_offsets[rc];
2890 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2891 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2896 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2897 (int)(charcount - 1));
2901 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2905 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2909 /*-----------------------------------------------------------------*/
2917 int *local_workspace;
2918 PCRE2_SIZE *local_offsets;
2919 PCRE2_SIZE charcount, matched_count;
2920 PCRE2_SPTR local_ptr = ptr;
2921 RWS_anchor *rws = (RWS_anchor *)RWS;
2924 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2926 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2927 if (rc != 0) return rc;
2931 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2932 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2933 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2935 if (codevalue == OP_BRAPOSZERO)
2938 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2940 else allow_zero = FALSE;
2942 /* Loop to match the subpattern as many times as possible as if it were
2943 a complete pattern. */
2945 for (matched_count = 0;; matched_count++)
2947 rc = internal_dfa_match(
2948 mb, /* fixed match data */
2949 code, /* this subexpression's code */
2950 local_ptr, /* where we currently are */
2951 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2952 local_offsets, /* offset vector */
2953 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2954 local_workspace, /* workspace vector */
2955 RWS_RSIZE, /* size of same */
2956 rlevel, /* function recursion level */
2957 RWS); /* recursion workspace */
2959 /* Failed to match */
2963 if (rc != PCRE2_ERROR_NOMATCH) return rc;
2967 /* Matched: break the loop if zero characters matched. */
2969 charcount = local_offsets[1] - local_offsets[0];
2970 if (charcount == 0) break;
2971 local_ptr += charcount; /* Advance temporary position ptr */
2974 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2976 /* At this point we have matched the subpattern matched_count
2977 times, and local_ptr is pointing to the character after the end of the
2980 if (matched_count > 0 || allow_zero)
2982 PCRE2_SPTR end_subpattern = code;
2983 int next_state_offset;
2985 do { end_subpattern += GET(end_subpattern, 1); }
2986 while (*end_subpattern == OP_ALT);
2988 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2990 /* Optimization: if there are no more active states, and there
2991 are no new states yet set up, then skip over the subject string
2992 right here, to save looping. Otherwise, set up the new state to swing
2993 into action when the end of the matched substring is reached. */
2995 if (i + 1 >= active_count && new_count == 0)
2999 ADD_NEW(next_state_offset, 0);
3004 PCRE2_SPTR pp = local_ptr;
3005 charcount = (PCRE2_SIZE)(pp - p);
3006 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3007 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3009 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3015 /*-----------------------------------------------------------------*/
3019 int *local_workspace;
3020 PCRE2_SIZE *local_offsets;
3021 RWS_anchor *rws = (RWS_anchor *)RWS;
3023 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3025 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3026 if (rc != 0) return rc;
3030 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3031 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3032 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3034 rc = internal_dfa_match(
3035 mb, /* fixed match data */
3036 code, /* this subexpression's code */
3037 ptr, /* where we currently are */
3038 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3039 local_offsets, /* offset vector */
3040 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3041 local_workspace, /* workspace vector */
3042 RWS_RSIZE, /* size of same */
3043 rlevel, /* function recursion level */
3044 RWS); /* recursion workspace */
3046 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3050 PCRE2_SPTR end_subpattern = code;
3051 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3052 int next_state_offset, repeat_state_offset;
3054 do { end_subpattern += GET(end_subpattern, 1); }
3055 while (*end_subpattern == OP_ALT);
3057 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3059 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3060 arrange for the repeat state also to be added to the relevant list.
3061 Calculate the offset, or set -1 for no repeat. */
3063 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3064 *end_subpattern == OP_KETRMIN)?
3065 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3067 /* If we have matched an empty string, add the next state at the
3068 current character pointer. This is important so that the duplicate
3069 checking kicks in, which is what breaks infinite loops that match an
3074 ADD_ACTIVE(next_state_offset, 0);
3077 /* Optimization: if there are no more active states, and there
3078 are no new states yet set up, then skip over the subject string
3079 right here, to save looping. Otherwise, set up the new state to swing
3080 into action when the end of the matched substring is reached. */
3082 else if (i + 1 >= active_count && new_count == 0)
3086 ADD_NEW(next_state_offset, 0);
3088 /* If we are adding a repeat state at the new character position,
3089 we must fudge things so that it is the only current state.
3090 Otherwise, it might be a duplicate of one we processed before, and
3091 that would cause it to be skipped. */
3093 if (repeat_state_offset >= 0)
3095 next_active_state = active_states;
3098 ADD_ACTIVE(repeat_state_offset, 0);
3103 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3106 PCRE2_SPTR p = start_subject + local_offsets[0];
3107 PCRE2_SPTR pp = start_subject + local_offsets[1];
3108 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3111 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3112 if (repeat_state_offset >= 0)
3113 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3116 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3121 /* ========================================================================== */
3122 /* Handle callouts */
3125 case OP_CALLOUT_STR:
3127 PCRE2_SIZE callout_length;
3128 rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3130 if (rrc < 0) return rrc; /* Abandon */
3132 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3137 /* ========================================================================== */
3138 default: /* Unsupported opcode */
3139 return PCRE2_ERROR_DFA_UITEM;
3142 NEXT_ACTIVE_STATE: continue;
3144 } /* End of loop scanning active states */
3146 /* We have finished the processing at the current subject character. If no
3147 new states have been set for the next character, we have found all the
3148 matches that we are going to find. If we are at the top level and partial
3149 matching has been requested, check for appropriate conditions.
3151 The "forced_fail" variable counts the number of (*F) encountered for the
3152 character. If it is equal to the original active_count (saved in
3153 workspace[1]) it means that (*F) was found on every active state. In this
3154 case we don't want to give a partial match.
3156 The "could_continue" variable is true if a state could have continued but
3157 for the fact that the end of the subject was reached. */
3161 if (rlevel == 1 && /* Top level, and */
3162 could_continue && /* Some could go on, and */
3163 forced_fail != workspace[1] && /* Not all forced fail & */
3165 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3167 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3168 match_count < 0) /* no matches */
3171 partial_newline || /* Either partial NL */
3173 ptr >= end_subject && /* End of subject and */
3174 ptr > mb->start_used_ptr) /* Inspected non-empty string */
3177 match_count = PCRE2_ERROR_PARTIAL;
3178 break; /* Exit from loop along the subject string */
3181 /* One or more states are active for the next character. */
3183 ptr += clen; /* Advance to next subject character */
3184 } /* Loop to move along the subject string */
3186 /* Control gets here from "break" a few lines above. If we have a match and
3187 PCRE2_ENDANCHORED is set, the match fails. */
3189 if (match_count >= 0 &&
3190 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3192 match_count = PCRE2_ERROR_NOMATCH;
3199 /*************************************************
3200 * Match a pattern using the DFA algorithm *
3201 *************************************************/
3203 /* This function matches a compiled pattern to a subject string, using the
3204 alternate matching algorithm that finds all matches at once.
3207 code points to the compiled pattern
3208 subject subject string
3209 length length of subject string
3210 start_offset where to start matching in the subject
3212 match_data points to a match data structure
3213 mcontext points to a match context
3214 workspace pointer to workspace
3215 wscount size of workspace
3217 Returns: > 0 => number of match offset pairs placed in offsets
3218 = 0 => offsets overflowed; longest matches are present
3219 -1 => failed to match
3220 < -1 => some kind of unexpected problem
3223 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3224 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3225 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3226 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3229 const pcre2_real_code *re = (const pcre2_real_code *)code;
3231 PCRE2_SPTR start_match;
3232 PCRE2_SPTR end_subject;
3233 PCRE2_SPTR bumpalong_limit;
3234 PCRE2_SPTR req_cu_ptr;
3236 BOOL utf, anchored, startline, firstline;
3237 BOOL has_first_cu = FALSE;
3238 BOOL has_req_cu = FALSE;
3240 PCRE2_UCHAR first_cu = 0;
3241 PCRE2_UCHAR first_cu2 = 0;
3242 PCRE2_UCHAR req_cu = 0;
3243 PCRE2_UCHAR req_cu2 = 0;
3245 const uint8_t *start_bits = NULL;
3247 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3248 is used below, and it expects NLBLOCK to be defined as a pointer. */
3250 pcre2_callout_block cb;
3251 dfa_match_block actual_match_block;
3252 dfa_match_block *mb = &actual_match_block;
3254 /* Set up a starting block of memory for use during recursive calls to
3255 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3256 in the case when it is not needed. If this is too small, more memory is
3257 obtained from the heap. At the start of each block is an anchor structure.*/
3259 int base_recursion_workspace[RWS_BASE_SIZE];
3260 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3262 rws->size = RWS_BASE_SIZE;
3263 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3265 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3268 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3270 /* Plausibility checks */
3272 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3273 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3274 return PCRE2_ERROR_NULL;
3275 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3276 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3278 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3281 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3282 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3283 return PCRE2_ERROR_BADOPTION;
3285 /* Check that the first field in the block is the magic number. If it is not,
3286 return with PCRE2_ERROR_BADMAGIC. */
3288 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3290 /* Check the code unit width. */
3292 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3293 return PCRE2_ERROR_BADMODE;
3295 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3296 options variable for this function. Users of PCRE2 who are not calling the
3297 function directly would like to have a way of setting these flags, in the same
3298 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3299 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3300 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3301 transferred to the options for this function. The bits are guaranteed to be
3302 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3303 that the match-time bits are not more significant than the flag bits. If by
3304 accident this is not the case, a compile-time division by zero error will
3307 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3308 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3309 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3313 /* If restarting after a partial match, do some sanity checks on the contents
3314 of the workspace. */
3316 if ((options & PCRE2_DFA_RESTART) != 0)
3318 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3319 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3320 return PCRE2_ERROR_DFA_BADRESTART;
3323 /* Set some local values */
3325 utf = (re->overall_options & PCRE2_UTF) != 0;
3326 start_match = subject + start_offset;
3327 end_subject = subject + length;
3328 req_cu_ptr = start_match - 1;
3329 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3330 (re->overall_options & PCRE2_ANCHORED) != 0;
3332 /* The "must be at the start of a line" flags are used in a loop when finding
3335 startline = (re->flags & PCRE2_STARTLINE) != 0;
3336 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3337 bumpalong_limit = end_subject;
3339 /* Initialize and set up the fixed fields in the callout block, with a pointer
3340 in the match block. */
3344 cb.subject = subject;
3345 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3346 cb.callout_flags = 0;
3347 cb.capture_top = 1; /* No capture support */
3348 cb.capture_last = 0;
3349 cb.mark = NULL; /* No (*MARK) support */
3351 /* Get data from the match context, if present, and fill in the remaining
3352 fields in the match block. It is an error to set an offset limit without
3353 setting the flag at compile time. */
3355 if (mcontext == NULL)
3358 mb->memctl = re->memctl;
3359 mb->match_limit = PRIV(default_match_context).match_limit;
3360 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3361 mb->heap_limit = PRIV(default_match_context).heap_limit;
3365 if (mcontext->offset_limit != PCRE2_UNSET)
3367 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3368 return PCRE2_ERROR_BADOFFSETLIMIT;
3369 bumpalong_limit = subject + mcontext->offset_limit;
3371 mb->callout = mcontext->callout;
3372 mb->callout_data = mcontext->callout_data;
3373 mb->memctl = mcontext->memctl;
3374 mb->match_limit = mcontext->match_limit;
3375 mb->match_limit_depth = mcontext->depth_limit;
3376 mb->heap_limit = mcontext->heap_limit;
3379 if (mb->match_limit > re->limit_match)
3380 mb->match_limit = re->limit_match;
3382 if (mb->match_limit_depth > re->limit_depth)
3383 mb->match_limit_depth = re->limit_depth;
3385 if (mb->heap_limit > re->limit_heap)
3386 mb->heap_limit = re->limit_heap;
3388 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3389 re->name_count * re->name_entry_size;
3390 mb->tables = re->tables;
3391 mb->start_subject = subject;
3392 mb->end_subject = end_subject;
3393 mb->start_offset = start_offset;
3394 mb->moptions = options;
3395 mb->poptions = re->overall_options;
3396 mb->match_call_count = 0;
3399 /* Process the \R and newline settings. */
3401 mb->bsr_convention = re->bsr_convention;
3402 mb->nltype = NLTYPE_FIXED;
3403 switch(re->newline_convention)
3405 case PCRE2_NEWLINE_CR:
3407 mb->nl[0] = CHAR_CR;
3410 case PCRE2_NEWLINE_LF:
3412 mb->nl[0] = CHAR_NL;
3415 case PCRE2_NEWLINE_NUL:
3417 mb->nl[0] = CHAR_NUL;
3420 case PCRE2_NEWLINE_CRLF:
3422 mb->nl[0] = CHAR_CR;
3423 mb->nl[1] = CHAR_NL;
3426 case PCRE2_NEWLINE_ANY:
3427 mb->nltype = NLTYPE_ANY;
3430 case PCRE2_NEWLINE_ANYCRLF:
3431 mb->nltype = NLTYPE_ANYCRLF;
3434 default: return PCRE2_ERROR_INTERNAL;
3437 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3438 we must also check that a starting offset does not point into the middle of a
3439 multiunit character. We check only the portion of the subject that is going to
3440 be inspected during matching - from the offset minus the maximum lookbehind
3441 to the given length. This saves time when a small part of a large subject is
3442 being matched by the use of a starting offset. Note that the maximum lookbehind
3443 is a number of characters, not code units. */
3445 #ifdef SUPPORT_UNICODE
3446 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3448 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3450 if (start_offset > 0)
3452 #if PCRE2_CODE_UNIT_WIDTH != 32
3454 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3455 return PCRE2_ERROR_BADUTFOFFSET;
3456 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3459 while (check_subject > subject &&
3460 #if PCRE2_CODE_UNIT_WIDTH == 8
3461 (*check_subject & 0xc0) == 0x80)
3463 (*check_subject & 0xfc00) == 0xdc00)
3464 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3467 #else /* In the 32-bit library, one code unit equals one character. */
3468 check_subject -= re->max_lookbehind;
3469 if (check_subject < subject) check_subject = subject;
3470 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3473 /* Validate the relevant portion of the subject. After an error, adjust the
3474 offset to be an absolute offset in the whole string. */
3476 match_data->rc = PRIV(valid_utf)(check_subject,
3477 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3478 if (match_data->rc != 0)
3480 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3481 return match_data->rc;
3484 #endif /* SUPPORT_UNICODE */
3486 /* Set up the first code unit to match, if available. If there's no first code
3487 unit there may be a bitmap of possible first characters. */
3489 if ((re->flags & PCRE2_FIRSTSET) != 0)
3491 has_first_cu = TRUE;
3492 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3493 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3495 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3496 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3497 if (utf && first_cu > 127)
3498 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3503 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3504 start_bits = re->start_bitmap;
3506 /* There may be a "last known required code unit" set. */
3508 if ((re->flags & PCRE2_LASTSET) != 0)
3511 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3512 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3514 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3515 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3516 if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3521 /* Fill in fields that are always returned in the match data. */
3523 match_data->code = re;
3524 match_data->subject = subject;
3525 match_data->mark = NULL;
3526 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3528 /* Call the main matching function, looping for a non-anchored regex after a
3529 failed match. If not restarting, perform certain optimizations at the start of
3534 /* ----------------- Start of match optimizations ---------------- */
3536 /* There are some optimizations that avoid running the match if a known
3537 starting point is not found, or if a known later code unit is not present.
3538 However, there is an option (settable at compile time) that disables
3539 these, for testing and for ensuring that all callouts do actually occur.
3540 The optimizations must also be avoided when restarting a DFA match. */
3542 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3543 (options & PCRE2_DFA_RESTART) == 0)
3545 /* If firstline is TRUE, the start of the match is constrained to the first
3546 line of a multiline string. That is, the match must be before or at the
3547 first newline following the start of matching. Temporarily adjust
3548 end_subject so that we stop the optimization scans for a first code unit
3549 immediately after the first character of a newline (the first code unit can
3550 legitimately be a newline). If the match fails at the newline, later code
3551 breaks this loop. */
3555 PCRE2_SPTR t = start_match;
3556 #ifdef SUPPORT_UNICODE
3559 while (t < end_subject && !IS_NEWLINE(t))
3562 ACROSSCHAR(t < end_subject, t, t++);
3567 while (t < end_subject && !IS_NEWLINE(t)) t++;
3571 /* Anchored: check the first code unit if one is recorded. This may seem
3572 pointless but it can help in detecting a no match case without scanning for
3573 the required code unit. */
3577 if (has_first_cu || start_bits != NULL)
3579 BOOL ok = start_match < end_subject;
3582 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3583 ok = has_first_cu && (c == first_cu || c == first_cu2);
3584 if (!ok && start_bits != NULL)
3586 #if PCRE2_CODE_UNIT_WIDTH != 8
3587 if (c > 255) c = 255;
3589 ok = (start_bits[c/8] & (1 << (c&7))) != 0;
3596 /* Not anchored. Advance to a unique first code unit if there is one. In
3597 8-bit mode, the use of memchr() gives a big speed up, even though we have
3598 to call it twice in caseless mode, in order to find the earliest occurrence
3599 of the character in either of its cases. */
3605 if (first_cu != first_cu2) /* Caseless */
3607 #if PCRE2_CODE_UNIT_WIDTH != 8
3609 while (start_match < end_subject &&
3610 (smc = UCHAR21TEST(start_match)) != first_cu &&
3613 #else /* 8-bit code units */
3615 memchr(start_match, first_cu, end_subject-start_match);
3617 memchr(start_match, first_cu2, end_subject-start_match);
3619 start_match = (pp2 == NULL)? end_subject : pp2;
3621 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3625 /* The caseful case */
3629 #if PCRE2_CODE_UNIT_WIDTH != 8
3630 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3634 start_match = memchr(start_match, first_cu, end_subject - start_match);
3635 if (start_match == NULL) start_match = end_subject;
3639 /* If we can't find the required code unit, having reached the true end
3640 of the subject, break the bumpalong loop, to force a match failure,
3641 except when doing partial matching, when we let the next cycle run at
3642 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3643 which partially matches "abc", even though the string does not contain
3644 the starting character "d". If we have not reached the true end of the
3645 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3646 we also let the cycle run, because the matching string is legitimately
3647 allowed to start with the first code unit of a newline. */
3649 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3650 start_match >= mb->end_subject)
3654 /* If there's no first code unit, advance to just after a linebreak for a
3655 multiline match if required. */
3659 if (start_match > mb->start_subject + start_offset)
3661 #ifdef SUPPORT_UNICODE
3664 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3667 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3672 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3675 /* If we have just passed a CR and the newline option is ANY or
3676 ANYCRLF, and we are now at a LF, advance the match position by one
3679 if (start_match[-1] == CHAR_CR &&
3680 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3681 start_match < end_subject &&
3682 UCHAR21TEST(start_match) == CHAR_NL)
3687 /* If there's no first code unit or a requirement for a multiline line
3688 start, advance to a non-unique first code unit if any have been
3689 identified. The bitmap contains only 256 bits. When code units are 16 or
3690 32 bits wide, all code units greater than 254 set the 255 bit. */
3692 else if (start_bits != NULL)
3694 while (start_match < end_subject)
3696 uint32_t c = UCHAR21TEST(start_match);
3697 #if PCRE2_CODE_UNIT_WIDTH != 8
3698 if (c > 255) c = 255;
3700 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3704 /* See comment above in first_cu checking about the next line. */
3706 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3707 start_match >= mb->end_subject)
3710 } /* End of first code unit handling */
3712 /* Restore fudged end_subject */
3714 end_subject = mb->end_subject;
3716 /* The following two optimizations are disabled for partial matching. */
3718 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3720 /* The minimum matching length is a lower bound; no actual string of that
3721 length may actually match the pattern. Although the value is, strictly,
3722 in characters, we treat it as code units to avoid spending too much time
3723 in this optimization. */
3725 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3727 /* If req_cu is set, we know that that code unit must appear in the
3728 subject for the match to succeed. If the first code unit is set, req_cu
3729 must be later in the subject; otherwise the test starts at the match
3730 point. This optimization can save a huge amount of backtracking in
3731 patterns with nested unlimited repeats that aren't going to match.
3732 Writing separate code for cased/caseless versions makes it go faster, as
3733 does using an autoincrement and backing off on a match.
3735 HOWEVER: when the subject string is very, very long, searching to its end
3736 can take a long time, and give bad performance on quite ordinary
3737 patterns. This showed up when somebody was matching something like
3738 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3739 sufficiently long. */
3741 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3743 PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3745 /* We don't need to repeat the search if we haven't yet reached the
3746 place we found it at last time. */
3750 if (req_cu != req_cu2)
3752 while (p < end_subject)
3754 uint32_t pp = UCHAR21INCTEST(p);
3755 if (pp == req_cu || pp == req_cu2) { p--; break; }
3760 while (p < end_subject)
3762 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3766 /* If we can't find the required code unit, break the matching loop,
3767 forcing a match failure. */
3769 if (p >= end_subject) break;
3771 /* If we have found the required code unit, save the point where we
3772 found it, so that we don't search again next time round the loop if
3773 the start hasn't passed this code unit yet. */
3781 /* ------------ End of start of match optimizations ------------ */
3783 /* Give no match if we have passed the bumpalong limit. */
3785 if (start_match > bumpalong_limit) break;
3787 /* OK, now we can do the business */
3789 mb->start_used_ptr = start_match;
3790 mb->last_used_ptr = start_match;
3791 mb->recursive = NULL;
3793 rc = internal_dfa_match(
3794 mb, /* fixed match data */
3795 mb->start_code, /* this subexpression's code */
3796 start_match, /* where we currently are */
3797 start_offset, /* start offset in subject */
3798 match_data->ovector, /* offset vector */
3799 (uint32_t)match_data->oveccount * 2, /* actual size of same */
3800 workspace, /* workspace vector */
3801 (int)wscount, /* size of same */
3802 0, /* function recurse level */
3803 base_recursion_workspace); /* initial workspace for recursion */
3805 /* Anything other than "no match" means we are done, always; otherwise, carry
3806 on only if not anchored. */
3808 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3810 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3812 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3813 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3815 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3816 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3817 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3818 match_data->rc = rc;
3822 /* Advance to the next subject character unless we are at the end of a line
3823 and firstline is set. */
3825 if (firstline && IS_NEWLINE(start_match)) break;
3827 #ifdef SUPPORT_UNICODE
3830 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3833 if (start_match > end_subject) break;
3835 /* If we have just passed a CR and we are now at a LF, and the pattern does
3836 not contain any explicit matches for \r or \n, and the newline option is CRLF
3837 or ANY or ANYCRLF, advance the match position by one more character. */
3839 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3840 start_match < end_subject &&
3841 UCHAR21TEST(start_match) == CHAR_NL &&
3842 (re->flags & PCRE2_HASCRORLF) == 0 &&
3843 (mb->nltype == NLTYPE_ANY ||
3844 mb->nltype == NLTYPE_ANYCRLF ||
3848 } /* "Bumpalong" loop */
3851 rc = PCRE2_ERROR_NOMATCH;
3854 while (rws->next != NULL)
3856 RWS_anchor *next = rws->next;
3857 rws->next = next->next;
3858 mb->memctl.free(next, mb->memctl.memory_data);
3864 /* End of pcre2_dfa_match.c */