pcre2_dfa_match.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9      Original API code Copyright (c) 1997-2012 University of Cambridge
  10           New API code Copyright (c) 2016-2018 University of Cambridge
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41
  42 /* This module contains the external function pcre2_dfa_match(), which is an
  43 alternative matching function that uses a sort of DFA algorithm (not a true
  44 FSM). This is NOT Perl-compatible, but it has advantages in certain
  45 applications. */
  46
  47
  48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
  49 the performance of his patterns greatly. I could not use it as it stood, as it
  50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
  51 test 7 to loop, and test 9 to crash with a segfault.
  52
  53 The issue is the check for duplicate states, which is done by a simple linear
  54 search up the state list. (Grep for "duplicate" below to find the code.) For
  55 many patterns, there will never be many states active at one time, so a simple
  56 linear search is fine. In patterns that have many active states, it might be a
  57 bottleneck. The suggested code used an indexing scheme to remember which states
  58 had previously been used for each character, and avoided the linear search when
  59 it knew there was no chance of a duplicate. This was implemented when adding
  60 states to the state lists.
  61
  62 I wrote some thread-safe, not-limited code to try something similar at the time
  63 of checking for duplicates (instead of when adding states), using index vectors
  64 on the stack. It did give a 13% improvement with one specially constructed
  65 pattern for certain subject strings, but on other strings and on many of the
  66 simpler patterns in the test suite it did worse. The major problem, I think,
  67 was the extra time to initialize the index. This had to be done for each call
  68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
  69 only once - I suspect this was the cause of the problems with the tests.)
  70
  71 Overall, I concluded that the gains in some cases did not outweigh the losses
  72 in others, so I abandoned this code. */
  73
  74
  75 #ifdef HAVE_CONFIG_H
  76 #include "config.h"
  77 #endif
   78 /* Set before pcre2_internal.h is included; presumably its macros use them. */
   79 #define NLBLOCK mb             /* Block containing newline information */
   80 #define PSSTART start_subject  /* Field containing processed string start */
   81 #define PSEND   end_subject    /* Field containing processed string end */
   82
   83 #include "pcre2_internal.h"
   84 /* Mask of PCRE2_xxx option bits; presumably those pcre2_dfa_match() accepts. */
   85 #define PUBLIC_DFA_MATCH_OPTIONS \
   86   (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   87    PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   88    PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
  89
  90
  91 /*************************************************
  92 *      Code parameters and static tables         *
  93 *************************************************/
  94
   95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
   96 into others, under special conditions. A gap of 20 between the blocks should be
   97 enough. The resulting opcodes don't have to be less than 256 because they are
   98 never stored, so we push them well clear of the normal opcodes. */
   99
  100 #define OP_PROP_EXTRA       300   /* Presumably for \P and \p (TODO confirm) */
  101 #define OP_EXTUNI_EXTRA     320   /* Presumably for \X (TODO confirm) */
  102 #define OP_ANYNL_EXTRA      340   /* Presumably for \R (TODO confirm) */
  103 #define OP_HSPACE_EXTRA     360   /* Presumably for \H and \h (TODO confirm) */
  104 #define OP_VSPACE_EXTRA     380   /* Presumably for \V and \v (TODO confirm) */
 105
 106
  107 /* coptable appears to be indexed by opcode (the row comments name the opcodes
  108 in order). It identifies those opcodes that are followed immediately by a
  109 character that is to be tested in some way, which makes it possible to
  110 centralize the loading of these characters. A non-zero value is the offset from
  111 the opcode where the character is to be found. In the case of Type * etc, the
  112 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which is always small.
  113 ***NOTE*** If the start is modified, the three following tables must be too. */
  114
  115 static const uint8_t coptable[] = {
  116   0,                             /* End                                    */
  117   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  118   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  119   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  120   0, 0,                          /* \P, \p                                 */
  121   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  122   0,                             /* \X                                     */
  123   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  124   1,                             /* Char                                   */
  125   1,                             /* Chari                                  */
  126   1,                             /* not                                    */
  127   1,                             /* noti                                   */
  128   /* Positive single-char repeats                                          */
  129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  130   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
  131   1+IMM2_SIZE,                   /* exact                                  */
  132   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
  133   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  134   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
  135   1+IMM2_SIZE,                   /* exact I                                */
  136   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
  137   /* Negative single-char repeats - only for chars < 256                   */
  138   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  139   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
  140   1+IMM2_SIZE,                   /* NOT exact                              */
  141   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
  142   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  143   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
  144   1+IMM2_SIZE,                   /* NOT exact I                            */
  145   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
  146   /* Positive type repeats                                                 */
  147   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  148   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
  149   1+IMM2_SIZE,                   /* Type exact                             */
  150   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
  151   /* Character class & ref repeats                                         */
  152   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  153   0, 0,                          /* CRRANGE, CRMINRANGE                    */
  154   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  155   0,                             /* CLASS                                  */
  156   0,                             /* NCLASS                                 */
  157   0,                             /* XCLASS - variable length               */
  158   0,                             /* REF                                    */
  159   0,                             /* REFI                                   */
  160   0,                             /* DNREF                                  */
  161   0,                             /* DNREFI                                 */
  162   0,                             /* RECURSE                                */
  163   0,                             /* CALLOUT                                */
  164   0,                             /* CALLOUT_STR                            */
  165   0,                             /* Alt                                    */
  166   0,                             /* Ket                                    */
  167   0,                             /* KetRmax                                */
  168   0,                             /* KetRmin                                */
  169   0,                             /* KetRpos                                */
  170   0,                             /* Reverse                                */
  171   0,                             /* Assert                                 */
  172   0,                             /* Assert not                             */
  173   0,                             /* Assert behind                          */
  174   0,                             /* Assert behind not                      */
  175   0,                             /* ONCE                                   */
  176   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  177   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  178   0, 0,                          /* CREF, DNCREF                           */
  179   0, 0,                          /* RREF, DNRREF                           */
  180   0, 0,                          /* FALSE, TRUE                            */
  181   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  182   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  183   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  184   0, 0,                          /* COMMIT, COMMIT_ARG                     */
  185   0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  186   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
  187 };
 188
  189 /* poptable marks with 1 the opcodes that inspect a character; its rows match
  190 those of coptable. It is used to remember the fact that a character could have
  191 been inspected when the end of the subject is reached. ***NOTE*** If the start
  192 of this table is modified, the two tables that follow must also be modified. */
  193
  194 static const uint8_t poptable[] = {
  195   0,                             /* End                                    */
  196   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  197   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  198   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  199   1, 1,                          /* \P, \p                                 */
  200   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  201   1,                             /* \X                                     */
  202   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  203   1,                             /* Char                                   */
  204   1,                             /* Chari                                  */
  205   1,                             /* not                                    */
  206   1,                             /* noti                                   */
  207   /* Positive single-char repeats                                          */
  208   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  209   1, 1, 1,                       /* upto, minupto, exact                   */
  210   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  211   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  212   1, 1, 1,                       /* upto I, minupto I, exact I             */
  213   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  214   /* Negative single-char repeats - only for chars < 256                   */
  215   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  216   1, 1, 1,                       /* NOT upto, minupto, exact               */
  217   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  218   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  219   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  220   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  221   /* Positive type repeats                                                 */
  222   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  223   1, 1, 1,                       /* Type upto, minupto, exact              */
  224   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  225   /* Character class & ref repeats                                         */
  226   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  227   1, 1,                          /* CRRANGE, CRMINRANGE                    */
  228   1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  229   1,                             /* CLASS                                  */
  230   1,                             /* NCLASS                                 */
  231   1,                             /* XCLASS - variable length               */
  232   0,                             /* REF                                    */
  233   0,                             /* REFI                                   */
  234   0,                             /* DNREF                                  */
  235   0,                             /* DNREFI                                 */
  236   0,                             /* RECURSE                                */
  237   0,                             /* CALLOUT                                */
  238   0,                             /* CALLOUT_STR                            */
  239   0,                             /* Alt                                    */
  240   0,                             /* Ket                                    */
  241   0,                             /* KetRmax                                */
  242   0,                             /* KetRmin                                */
  243   0,                             /* KetRpos                                */
  244   0,                             /* Reverse                                */
  245   0,                             /* Assert                                 */
  246   0,                             /* Assert not                             */
  247   0,                             /* Assert behind                          */
  248   0,                             /* Assert behind not                      */
  249   0,                             /* ONCE                                   */
  250   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  251   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  252   0, 0,                          /* CREF, DNCREF                           */
  253   0, 0,                          /* RREF, DNRREF                           */
  254   0, 0,                          /* FALSE, TRUE                            */
  255   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  256   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  257   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  258   0, 0,                          /* COMMIT, COMMIT_ARG                     */
  259   0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  260   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
  261 };
 262
  263 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
  264 and \w. The rows appear to follow the opcode order at the start of coptable. */
  265
  266 static const uint8_t toptable1[] = {
  267   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
  268   ctype_digit, ctype_digit,       /* \D, \d: same ctype bit for both */
  269   ctype_space, ctype_space,       /* \S, \s */
  270   ctype_word,  ctype_word,        /* \W, \w */
  271   0, 0                            /* OP_ANY, OP_ALLANY */
  272 };
  273
  274 static const uint8_t toptable2[] = {
  275   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
  276   ctype_digit, 0,                 /* \D, \d: bit present for the negated type */
  277   ctype_space, 0,                 /* \S, \s */
  278   ctype_word,  0,                 /* \W, \w */
  279   1, 1                            /* OP_ANY, OP_ALLANY */
  280 };
 281
 282
  283 /* Structure for holding data about a particular state, which is in effect the
  284 current data for an active path through the match tree. It must consist
  285 entirely of ints because the working vector we are passed, and which we put
  286 these structures in, is a vector of ints. */
  287
  288 typedef struct stateblock {
  289   int offset;                     /* Opcode offset from start_code (-ve has meaning) */
  290   int count;                      /* Count for repeats */
  291   int data;                       /* Extra data, set by the ADD_*_DATA macros */
  292 } stateblock;
  293 /* Number of ints that one stateblock occupies. */
  294 #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
 295
 296
  297 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
  298 local working space and output vectors that were created on the stack. This has
  299 caused issues for some patterns, especially in small-stack environments such as
  300 Windows. A new scheme is now in use which sets up a vector on the stack, but if
  301 this is too small, heap memory is used, up to the heap_limit. The main
  302 parameters are all numbers of ints because the workspace is a vector of ints.
  303
  304 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
  305 defined in pcre2_internal.h so as to be available to pcre2test when it is
  306 finding the minimum heap requirement for a match. */
  307 /* Number of ints that one PCRE2_SIZE (ovector element) occupies. */
  308 #define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))
  309
  310 #define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
  311 #define RWS_RSIZE       1000                    /* Work size for recursion */
  312 #define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
  313 #define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
  314
  315 /* This structure is at the start of each workspace block; 'next' chains them. */
  316
  317 typedef struct RWS_anchor {
  318   struct RWS_anchor *next;   /* Following block, or NULL if there is none */
  319   unsigned int size;  /* Number of ints in the whole block, anchor included */
  320   unsigned int free;  /* Number of ints free (size less anchor when fresh) */
  321 } RWS_anchor;
  322 /* Number of ints that the anchor itself occupies. */
  323 #define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
 324
 325
 326
 327 /*************************************************
 328 *               Process a callout                *
 329 *************************************************/
 330
 331 /* This function is called to perform a callout.
 332
 333 Arguments:
 334   code              current code pointer
 335   offsets           points to current capture offsets
 336   current_subject   start of current subject match
 337   ptr               current position in subject
 338   mb                the match block
 339   extracode         extra code offset when called from condition
 340   lengthptr         where to return the callout length
 341
 342 Returns:            the return from the callout
 343 */
 344
 345 static int
 346 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
 347   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
 348   PCRE2_SIZE *lengthptr)
 349 {
 350 pcre2_callout_block *cb = mb->cb;
 351
 352 *lengthptr = (code[extracode] == OP_CALLOUT)?
 353   (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
 354   (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
 355
 356 if (mb->callout == NULL) return 0;    /* No callout provided */
 357
 358 /* Fixed fields in the callout block are set once and for all at the start of
 359 matching. */
 360
 361 cb->offset_vector    = offsets;
 362 cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
 363 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
 364 cb->pattern_position = GET(code, 1 + extracode);
 365 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
 366
 367 if (code[extracode] == OP_CALLOUT)
 368   {
 369   cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
 370   cb->callout_string_offset = 0;
 371   cb->callout_string = NULL;
 372   cb->callout_string_length = 0;
 373   }
 374 else
 375   {
 376   cb->callout_number = 0;
 377   cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
 378   cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
 379   cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
 380   }
 381
 382 return (mb->callout)(cb, mb->callout_data);
 383 }
 384
 385
 386
 387 /*************************************************
 388 *         Expand local workspace memory          *
 389 *************************************************/
 390
 391 /* This function is called when internal_dfa_match() is about to be called
 392 recursively and there is insufficient working space left in the current
 393 workspace block. If there's an existing next block, use it; otherwise get a new
 394 block unless the heap limit is reached.
 395
 396 Arguments:
 397   rwsptr     pointer to block pointer (updated)
 398   ovecsize   space needed for an ovector
 399   mb         the match block
 400
 401 Returns:     0 rwsptr has been updated
 402             !0 an error code
 403 */
 404
 405 static int
 406 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
 407 {
 408 RWS_anchor *rws = *rwsptr;
 409 RWS_anchor *new;
 410
 411 if (rws->next != NULL)
 412   {
 413   new = rws->next;
 414   }
 415
 416 /* All sizes are in units of sizeof(int), except for mb->heaplimit, which is in
 417 kibibytes. */
 418
 419 else
 420   {
 421   unsigned int newsize = rws->size * 2;
 422   unsigned int heapleft = (unsigned int)
 423     (((1024/sizeof(int))*mb->heap_limit - mb->heap_used));
 424   if (newsize > heapleft) newsize = heapleft;
 425   if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
 426     return PCRE2_ERROR_HEAPLIMIT;
 427   new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
 428   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
 429   mb->heap_used += newsize;
 430   new->next = NULL;
 431   new->size = newsize;
 432   rws->next = new;
 433   }
 434
 435 new->free = new->size - RWS_ANCHOR_SIZE;
 436 *rwsptr = new;
 437 return 0;
 438 }
 439
 440
 441
 442 /*************************************************
 443 *     Match a Regular Expression - DFA engine    *
 444 *************************************************/
 445
 446 /* This internal function applies a compiled pattern to a subject string,
 447 starting at a given point, using a DFA engine. This function is called from the
 448 external one, possibly multiple times if the pattern is not anchored. The
 449 function calls itself recursively for some kinds of subpattern.
 450
 451 Arguments:
 452   mb                the match_data block with fixed information
 453   this_start_code   the opening bracket of this subexpression's code
 454   current_subject   where we currently are in the subject string
 455   start_offset      start offset in the subject string
 456   offsets           vector to contain the matching string offsets
 457   offsetcount       size of same
 458   workspace         vector of workspace
 459   wscount           size of same
 460   rlevel            function call recursion level
 461
 462 Returns:            > 0 => number of match offset pairs placed in offsets
 463                     = 0 => offsets overflowed; longest matches are present
 464                      -1 => failed to match
 465                    < -1 => some kind of unexpected problem
 466
 467 The following macros are used for adding states to the two state vectors (one
 468 for the current character, one for the following character). */
  469 /* Add state (x,y) to the active list; return PCRE2_ERROR_DFA_WSSIZE if full. */
  470 #define ADD_ACTIVE(x,y) \
  471   if (active_count++ < wscount) \
  472     { \
  473     next_active_state->offset = (x); \
  474     next_active_state->count  = (y); \
  475     next_active_state++; \
  476     } \
  477   else return PCRE2_ERROR_DFA_WSSIZE
  478 /* The same, but the state's data field is set to z as well. */
  479 #define ADD_ACTIVE_DATA(x,y,z) \
  480   if (active_count++ < wscount) \
  481     { \
  482     next_active_state->offset = (x); \
  483     next_active_state->count  = (y); \
  484     next_active_state->data   = (z); \
  485     next_active_state++; \
  486     } \
  487   else return PCRE2_ERROR_DFA_WSSIZE
  488 /* Add state (x,y) to the new-state list; return PCRE2_ERROR_DFA_WSSIZE if full. */
  489 #define ADD_NEW(x,y) \
  490   if (new_count++ < wscount) \
  491     { \
  492     next_new_state->offset = (x); \
  493     next_new_state->count  = (y); \
  494     next_new_state++; \
  495     } \
  496   else return PCRE2_ERROR_DFA_WSSIZE
  497 /* The same, but the state's data field is set to z as well. */
  498 #define ADD_NEW_DATA(x,y,z) \
  499   if (new_count++ < wscount) \
  500     { \
  501     next_new_state->offset = (x); \
  502     next_new_state->count  = (y); \
  503     next_new_state->data   = (z); \
  504     next_new_state++; \
  505     } \
  506   else return PCRE2_ERROR_DFA_WSSIZE
 507
 508 /* And now, here is the code */
 509
 510 static int
 511 internal_dfa_match(
 512   dfa_match_block *mb,
 513   PCRE2_SPTR this_start_code,
 514   PCRE2_SPTR current_subject,
 515   PCRE2_SIZE start_offset,
 516   PCRE2_SIZE *offsets,
 517   uint32_t offsetcount,
 518   int *workspace,
 519   int wscount,
 520   uint32_t rlevel,
 521   int *RWS)
 522 {
 523 stateblock *active_states, *new_states, *temp_states;
 524 stateblock *next_active_state, *next_new_state;
 525 const uint8_t *ctypes, *lcc, *fcc;
 526 PCRE2_SPTR ptr;
 527 PCRE2_SPTR end_code;
 528 dfa_recursion_info new_recursive;
 529 int active_count, new_count, match_count;
 530
 531 /* Some fields in the mb block are frequently referenced, so we load them into
 532 independent variables in the hope that this will perform better. */
 533
 534 PCRE2_SPTR start_subject = mb->start_subject;
 535 PCRE2_SPTR end_subject = mb->end_subject;
 536 PCRE2_SPTR start_code = mb->start_code;
 537
 538 #ifdef SUPPORT_UNICODE
 539 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
 540 #else
 541 BOOL utf = FALSE;
 542 #endif
 543
 544 BOOL reset_could_continue = FALSE;
 545
 546 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
 547 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
 548 offsetcount &= (uint32_t)(-2);  /* Round down */
 549
 550 wscount -= 2;
 551 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 552           (2 * INTS_PER_STATEBLOCK);
 553
 554 ctypes = mb->tables + ctypes_offset;
 555 lcc = mb->tables + lcc_offset;
 556 fcc = mb->tables + fcc_offset;
 557
 558 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
 559
 560 active_states = (stateblock *)(workspace + 2);
 561 next_new_state = new_states = active_states + wscount;
 562 new_count = 0;
 563
 564 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 565 the alternative states onto the list, and find out where the end is. This
  566 makes it possible to use this function recursively, when we want to stop at a
 567 matching internal ket rather than at the end.
 568
 569 If we are dealing with a backward assertion we have to find out the maximum
 570 amount to move back, and set up each alternative appropriately. */
 571
 572 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
 573   {
 574   size_t max_back = 0;
 575   size_t gone_back;
 576
 577   end_code = this_start_code;
 578   do
 579     {
 580     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
 581     if (back > max_back) max_back = back;
 582     end_code += GET(end_code, 1);
 583     }
 584   while (*end_code == OP_ALT);
 585
 586   /* If we can't go back the amount required for the longest lookbehind
 587   pattern, go back as far as we can; some alternatives may still be viable. */
 588
 589 #ifdef SUPPORT_UNICODE
 590   /* In character mode we have to step back character by character */
 591
 592   if (utf)
 593     {
 594     for (gone_back = 0; gone_back < max_back; gone_back++)
 595       {
 596       if (current_subject <= start_subject) break;
 597       current_subject--;
 598       ACROSSCHAR(current_subject > start_subject, current_subject,
 599         current_subject--);
 600       }
 601     }
 602   else
 603 #endif
 604
 605   /* In byte-mode we can do this quickly. */
 606
 607     {
 608     size_t current_offset = (size_t)(current_subject - start_subject);
 609     gone_back = (current_offset < max_back)? current_offset : max_back;
 610     current_subject -= gone_back;
 611     }
 612
 613   /* Save the earliest consulted character */
 614
 615   if (current_subject < mb->start_used_ptr)
 616     mb->start_used_ptr = current_subject;
 617
 618   /* Now we can process the individual branches. There will be an OP_REVERSE at
 619   the start of each branch, except when the length of the branch is zero. */
 620
 621   end_code = this_start_code;
 622   do
 623     {
 624     uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
 625     size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
 626     if (back <= gone_back)
 627       {
 628       int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
 629       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
 630       }
 631     end_code += GET(end_code, 1);
 632     }
 633   while (*end_code == OP_ALT);
 634  }
 635
 636 /* This is the code for a "normal" subpattern (not a backward assertion). The
 637 start of a whole pattern is always one of these. If we are at the top level,
 638 we may be asked to restart matching from the same point that we reached for a
 639 previous partial match. We still have to scan through the top-level branches to
 640 find the end state. */
 641
 642 else
 643   {
 644   end_code = this_start_code;
 645
 646   /* Restarting */
 647
 648   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
 649     {
 650     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 651     new_count = workspace[1];
 652     if (!workspace[0])
 653       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
 654     }
 655
 656   /* Not restarting */
 657
 658   else
 659     {
 660     int length = 1 + LINK_SIZE +
 661       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 662         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 663         ? IMM2_SIZE:0);
 664     do
 665       {
 666       ADD_NEW((int)(end_code - start_code + length), 0);
 667       end_code += GET(end_code, 1);
 668       length = 1 + LINK_SIZE;
 669       }
 670     while (*end_code == OP_ALT);
 671     }
 672   }
 673
 674 workspace[0] = 0;    /* Bit indicating which vector is current */
 675
 676 /* Loop for scanning the subject */
 677
 678 ptr = current_subject;
 679 for (;;)
 680   {
 681   int i, j;
 682   int clen, dlen;
 683   uint32_t c, d;
 684   int forced_fail = 0;
 685   BOOL partial_newline = FALSE;
 686   BOOL could_continue = reset_could_continue;
 687   reset_could_continue = FALSE;
 688
 689   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
 690
 691   /* Make the new state list into the active state list and empty the
 692   new state list. */
 693
 694   temp_states = active_states;
 695   active_states = new_states;
 696   new_states = temp_states;
 697   active_count = new_count;
 698   new_count = 0;
 699
 700   workspace[0] ^= 1;              /* Remember for the restarting feature */
 701   workspace[1] = active_count;
 702
 703   /* Set the pointers for adding new states */
 704
 705   next_active_state = active_states + active_count;
 706   next_new_state = new_states;
 707
 708   /* Load the current character from the subject outside the loop, as many
 709   different states may want to look at it, and we assume that at least one
 710   will. */
 711
 712   if (ptr < end_subject)
 713     {
 714     clen = 1;        /* Number of data items in the character */
 715 #ifdef SUPPORT_UNICODE
 716     GETCHARLENTEST(c, ptr, clen);
 717 #else
 718     c = *ptr;
 719 #endif  /* SUPPORT_UNICODE */
 720     }
 721   else
 722     {
 723     clen = 0;        /* This indicates the end of the subject */
 724     c = NOTACHAR;    /* This value should never actually be used */
 725     }
 726
 727   /* Scan up the active states and act on each one. The result of an action
 728   may be to add more states to the currently active list (e.g. on hitting a
 729   parenthesis) or it may be to put states on the new list, for considering
 730   when we move the character pointer on. */
 731
 732   for (i = 0; i < active_count; i++)
 733     {
 734     stateblock *current_state = active_states + i;
 735     BOOL caseless = FALSE;
 736     PCRE2_SPTR code;
 737     uint32_t codevalue;
 738     int state_offset = current_state->offset;
 739     int rrc;
 740     int count;
 741
 742     /* A negative offset is a special case meaning "hold off going to this
 743     (negated) state until the number of characters in the data field have
 744     been skipped". If the could_continue flag was passed over from a previous
 745     state, arrange for it to be passed on. */
 746
 747     if (state_offset < 0)
 748       {
 749       if (current_state->data > 0)
 750         {
 751         ADD_NEW_DATA(state_offset, current_state->count,
 752           current_state->data - 1);
 753         if (could_continue) reset_could_continue = TRUE;
 754         continue;
 755         }
 756       else
 757         {
 758         current_state->offset = state_offset = -state_offset;
 759         }
 760       }
 761
 762     /* Check for a duplicate state with the same count, and skip if found.
 763     See the note at the head of this module about the possibility of improving
 764     performance here. */
 765
 766     for (j = 0; j < i; j++)
 767       {
 768       if (active_states[j].offset == state_offset &&
 769           active_states[j].count == current_state->count)
 770         goto NEXT_ACTIVE_STATE;
 771       }
 772
 773     /* The state offset is the offset to the opcode */
 774
 775     code = start_code + state_offset;
 776     codevalue = *code;
 777
 778     /* If this opcode inspects a character, but we are at the end of the
 779     subject, remember the fact for use when testing for a partial match. */
 780
 781     if (clen == 0 && poptable[codevalue] != 0)
 782       could_continue = TRUE;
 783
 784     /* If this opcode is followed by an inline character, load it. It is
 785     tempting to test for the presence of a subject character here, but that
 786     is wrong, because sometimes zero repetitions of the subject are
 787     permitted.
 788
 789     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 790     argument that is not a data character - but is always one byte long because
 791     the values are small. We have to take special action to deal with  \P, \p,
 792     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
 793     these ones to new opcodes. */
 794
 795     if (coptable[codevalue] > 0)
 796       {
 797       dlen = 1;
 798 #ifdef SUPPORT_UNICODE
 799       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 800 #endif  /* SUPPORT_UNICODE */
 801       d = code[coptable[codevalue]];
 802       if (codevalue >= OP_TYPESTAR)
 803         {
 804         switch(d)
 805           {
 806           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
 807           case OP_NOTPROP:
 808           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 809           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 810           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 811           case OP_NOT_HSPACE:
 812           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 813           case OP_NOT_VSPACE:
 814           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 815           default: break;
 816           }
 817         }
 818       }
 819     else
 820       {
 821       dlen = 0;         /* Not strictly necessary, but compilers moan */
 822       d = NOTACHAR;     /* if these variables are not set. */
 823       }
 824
 825
 826     /* Now process the individual opcodes */
 827
 828     switch (codevalue)
 829       {
 830 /* ========================================================================== */
 831       /* These cases are never obeyed. This is a fudge that causes a compile-
 832       time error if the vectors coptable or poptable, which are indexed by
 833       opcode, are not the correct length. It seems to be the only way to do
 834       such a check at compile time, as the sizeof() operator does not work
 835       in the C preprocessor. */
 836
 837       case OP_TABLE_LENGTH:
 838       case OP_TABLE_LENGTH +
 839         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
 840          (sizeof(poptable) == OP_TABLE_LENGTH)):
 841       return 0;
 842
 843 /* ========================================================================== */
 844       /* Reached a closing bracket. If not at the end of the pattern, carry
 845       on with the next opcode. For repeating opcodes, also add the repeat
 846       state. Note that KETRPOS will always be encountered at the end of the
 847       subpattern, because the possessive subpattern repeats are always handled
 848       using recursive calls. Thus, it never adds any new states.
 849
 850       At the end of the (sub)pattern, unless we have an empty string and
 851       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
 852       start of the subject, save the match data, shifting up all previous
 853       matches so we always have the longest first. */
 854
 855       case OP_KET:
 856       case OP_KETRMIN:
 857       case OP_KETRMAX:
 858       case OP_KETRPOS:
 859       if (code != end_code)
 860         {
 861         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 862         if (codevalue != OP_KET)
 863           {
 864           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
 865           }
 866         }
 867       else
 868         {
 869         if (ptr > current_subject ||
 870             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
 871               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
 872                 current_subject > start_subject + mb->start_offset)))
 873           {
 874           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 875             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
 876               match_count = 0;
 877           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
 878           if (count > 0) (void)memmove(offsets + 2, offsets,
 879             (size_t)count * sizeof(PCRE2_SIZE));
 880           if (offsetcount >= 2)
 881             {
 882             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
 883             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
 884             }
 885           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
 886           }
 887         }
 888       break;
 889
 890 /* ========================================================================== */
 891       /* These opcodes add to the current list of states without looking
 892       at the current character. */
 893
 894       /*-----------------------------------------------------------------*/
 895       case OP_ALT:
 896       do { code += GET(code, 1); } while (*code == OP_ALT);
 897       ADD_ACTIVE((int)(code - start_code), 0);
 898       break;
 899
 900       /*-----------------------------------------------------------------*/
 901       case OP_BRA:
 902       case OP_SBRA:
 903       do
 904         {
 905         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 906         code += GET(code, 1);
 907         }
 908       while (*code == OP_ALT);
 909       break;
 910
 911       /*-----------------------------------------------------------------*/
 912       case OP_CBRA:
 913       case OP_SCBRA:
 914       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
 915       code += GET(code, 1);
 916       while (*code == OP_ALT)
 917         {
 918         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
 919         code += GET(code, 1);
 920         }
 921       break;
 922
 923       /*-----------------------------------------------------------------*/
 924       case OP_BRAZERO:
 925       case OP_BRAMINZERO:
 926       ADD_ACTIVE(state_offset + 1, 0);
 927       code += 1 + GET(code, 2);
 928       while (*code == OP_ALT) code += GET(code, 1);
 929       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 930       break;
 931
 932       /*-----------------------------------------------------------------*/
 933       case OP_SKIPZERO:
 934       code += 1 + GET(code, 2);
 935       while (*code == OP_ALT) code += GET(code, 1);
 936       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 937       break;
 938
 939       /*-----------------------------------------------------------------*/
 940       case OP_CIRC:
 941       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
 942         { ADD_ACTIVE(state_offset + 1, 0); }
 943       break;
 944
 945       /*-----------------------------------------------------------------*/
 946       case OP_CIRCM:
 947       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
 948           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
 949             && WAS_NEWLINE(ptr)))
 950         { ADD_ACTIVE(state_offset + 1, 0); }
 951       break;
 952
 953       /*-----------------------------------------------------------------*/
 954       case OP_EOD:
 955       if (ptr >= end_subject)
 956         {
 957         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
 958           could_continue = TRUE;
 959         else { ADD_ACTIVE(state_offset + 1, 0); }
 960         }
 961       break;
 962
 963       /*-----------------------------------------------------------------*/
 964       case OP_SOD:
 965       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 966       break;
 967
 968       /*-----------------------------------------------------------------*/
 969       case OP_SOM:
 970       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 971       break;
 972
 973
 974 /* ========================================================================== */
 975       /* These opcodes inspect the next subject character, and sometimes
 976       the previous one as well, but do not have an argument. The variable
 977       clen contains the length of the current character and is zero if we are
 978       at the end of the subject. */
 979
 980       /*-----------------------------------------------------------------*/
 981       case OP_ANY:
 982       if (clen > 0 && !IS_NEWLINE(ptr))
 983         {
 984         if (ptr + 1 >= mb->end_subject &&
 985             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
 986             NLBLOCK->nltype == NLTYPE_FIXED &&
 987             NLBLOCK->nllen == 2 &&
 988             c == NLBLOCK->nl[0])
 989           {
 990           could_continue = partial_newline = TRUE;
 991           }
 992         else
 993           {
 994           ADD_NEW(state_offset + 1, 0);
 995           }
 996         }
 997       break;
 998
 999       /*-----------------------------------------------------------------*/
1000       case OP_ALLANY:
1001       if (clen > 0)
1002         { ADD_NEW(state_offset + 1, 0); }
1003       break;
1004
1005       /*-----------------------------------------------------------------*/
1006       case OP_EODN:
1007       if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1008         could_continue = TRUE;
1009       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1010         { ADD_ACTIVE(state_offset + 1, 0); }
1011       break;
1012
1013       /*-----------------------------------------------------------------*/
1014       case OP_DOLL:
1015       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1016         {
1017         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1018           could_continue = TRUE;
1019         else if (clen == 0 ||
1020             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1021                (ptr == end_subject - mb->nllen)
1022             ))
1023           { ADD_ACTIVE(state_offset + 1, 0); }
1024         else if (ptr + 1 >= mb->end_subject &&
1025                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1026                  NLBLOCK->nltype == NLTYPE_FIXED &&
1027                  NLBLOCK->nllen == 2 &&
1028                  c == NLBLOCK->nl[0])
1029           {
1030           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1031             {
1032             reset_could_continue = TRUE;
1033             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1034             }
1035           else could_continue = partial_newline = TRUE;
1036           }
1037         }
1038       break;
1039
1040       /*-----------------------------------------------------------------*/
1041       case OP_DOLLM:
1042       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1043         {
1044         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045           could_continue = TRUE;
1046         else if (clen == 0 ||
1047             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1048           { ADD_ACTIVE(state_offset + 1, 0); }
1049         else if (ptr + 1 >= mb->end_subject &&
1050                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1051                  NLBLOCK->nltype == NLTYPE_FIXED &&
1052                  NLBLOCK->nllen == 2 &&
1053                  c == NLBLOCK->nl[0])
1054           {
1055           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1056             {
1057             reset_could_continue = TRUE;
1058             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1059             }
1060           else could_continue = partial_newline = TRUE;
1061           }
1062         }
1063       else if (IS_NEWLINE(ptr))
1064         { ADD_ACTIVE(state_offset + 1, 0); }
1065       break;
1066
1067       /*-----------------------------------------------------------------*/
1068
1069       case OP_DIGIT:
1070       case OP_WHITESPACE:
1071       case OP_WORDCHAR:
1072       if (clen > 0 && c < 256 &&
1073             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1074         { ADD_NEW(state_offset + 1, 0); }
1075       break;
1076
1077       /*-----------------------------------------------------------------*/
1078       case OP_NOT_DIGIT:
1079       case OP_NOT_WHITESPACE:
1080       case OP_NOT_WORDCHAR:
1081       if (clen > 0 && (c >= 256 ||
1082             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1083         { ADD_NEW(state_offset + 1, 0); }
1084       break;
1085
1086       /*-----------------------------------------------------------------*/
1087       case OP_WORD_BOUNDARY:
1088       case OP_NOT_WORD_BOUNDARY:
1089         {
1090         int left_word, right_word;
1091
1092         if (ptr > start_subject)
1093           {
1094           PCRE2_SPTR temp = ptr - 1;
1095           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1096 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1097           if (utf) { BACKCHAR(temp); }
1098 #endif
1099           GETCHARTEST(d, temp);
1100 #ifdef SUPPORT_UNICODE
1101           if ((mb->poptions & PCRE2_UCP) != 0)
1102             {
1103             if (d == '_') left_word = TRUE; else
1104               {
1105               uint32_t cat = UCD_CATEGORY(d);
1106               left_word = (cat == ucp_L || cat == ucp_N);
1107               }
1108             }
1109           else
1110 #endif
1111           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1112           }
1113         else left_word = FALSE;
1114
1115         if (clen > 0)
1116           {
1117           if (ptr >= mb->last_used_ptr)
1118             {
1119             PCRE2_SPTR temp = ptr + 1;
1120 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1121             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1122 #endif
1123             mb->last_used_ptr = temp;
1124             }
1125 #ifdef SUPPORT_UNICODE
1126           if ((mb->poptions & PCRE2_UCP) != 0)
1127             {
1128             if (c == '_') right_word = TRUE; else
1129               {
1130               uint32_t cat = UCD_CATEGORY(c);
1131               right_word = (cat == ucp_L || cat == ucp_N);
1132               }
1133             }
1134           else
1135 #endif
1136           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1137           }
1138         else right_word = FALSE;
1139
1140         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1141           { ADD_ACTIVE(state_offset + 1, 0); }
1142         }
1143       break;
1144
1145
1146       /*-----------------------------------------------------------------*/
1147       /* Check the next character by Unicode property. We will get here only
1148       if the support is in the binary; otherwise a compile-time error occurs.
1149       */
1150
1151 #ifdef SUPPORT_UNICODE
1152       case OP_PROP:
1153       case OP_NOTPROP:
1154       if (clen > 0)
1155         {
1156         BOOL OK;
1157         const uint32_t *cp;
1158         const ucd_record * prop = GET_UCD(c);
1159         switch(code[1])
1160           {
1161           case PT_ANY:
1162           OK = TRUE;
1163           break;
1164
1165           case PT_LAMP:
1166           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1167                prop->chartype == ucp_Lt;
1168           break;
1169
1170           case PT_GC:
1171           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1172           break;
1173
1174           case PT_PC:
1175           OK = prop->chartype == code[2];
1176           break;
1177
1178           case PT_SC:
1179           OK = prop->script == code[2];
1180           break;
1181
1182           /* These are specials for combination cases. */
1183
1184           case PT_ALNUM:
1185           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1186                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1187           break;
1188
1189           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1190           which means that Perl space and POSIX space are now identical. PCRE
1191           was changed at release 8.34. */
1192
1193           case PT_SPACE:    /* Perl space */
1194           case PT_PXSPACE:  /* POSIX space */
1195           switch(c)
1196             {
1197             HSPACE_CASES:
1198             VSPACE_CASES:
1199             OK = TRUE;
1200             break;
1201
1202             default:
1203             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1204             break;
1205             }
1206           break;
1207
1208           case PT_WORD:
1209           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1210                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1211                c == CHAR_UNDERSCORE;
1212           break;
1213
1214           case PT_CLIST:
1215           cp = PRIV(ucd_caseless_sets) + code[2];
1216           for (;;)
1217             {
1218             if (c < *cp) { OK = FALSE; break; }
1219             if (c == *cp++) { OK = TRUE; break; }
1220             }
1221           break;
1222
1223           case PT_UCNC:
1224           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1225                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1226                c >= 0xe000;
1227           break;
1228
1229           /* Should never occur, but keep compilers from grumbling. */
1230
1231           default:
1232           OK = codevalue != OP_PROP;
1233           break;
1234           }
1235
1236         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1237         }
1238       break;
1239 #endif
1240
1241
1242
1243 /* ========================================================================== */
1244       /* These opcodes likewise inspect the subject character, but have an
1245       argument that is not a data character. It is one of these opcodes:
1246       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1247       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1248
1249       case OP_TYPEPLUS:
1250       case OP_TYPEMINPLUS:
1251       case OP_TYPEPOSPLUS:
1252       count = current_state->count;  /* Already matched */
1253       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1254       if (clen > 0)
1255         {
1256         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1257             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1258             NLBLOCK->nltype == NLTYPE_FIXED &&
1259             NLBLOCK->nllen == 2 &&
1260             c == NLBLOCK->nl[0])
1261           {
1262           could_continue = partial_newline = TRUE;
1263           }
1264         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265             (c < 256 &&
1266               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268           {
1269           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1270             {
1271             active_count--;            /* Remove non-match possibility */
1272             next_active_state--;
1273             }
1274           count++;
1275           ADD_NEW(state_offset, count);
1276           }
1277         }
1278       break;
1279
1280       /*-----------------------------------------------------------------*/
1281       case OP_TYPEQUERY:
1282       case OP_TYPEMINQUERY:
1283       case OP_TYPEPOSQUERY:
1284       ADD_ACTIVE(state_offset + 2, 0);
1285       if (clen > 0)
1286         {
1287         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1288             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1289             NLBLOCK->nltype == NLTYPE_FIXED &&
1290             NLBLOCK->nllen == 2 &&
1291             c == NLBLOCK->nl[0])
1292           {
1293           could_continue = partial_newline = TRUE;
1294           }
1295         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1296             (c < 256 &&
1297               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1298               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1299           {
1300           if (codevalue == OP_TYPEPOSQUERY)
1301             {
1302             active_count--;            /* Remove non-match possibility */
1303             next_active_state--;
1304             }
1305           ADD_NEW(state_offset + 2, 0);
1306           }
1307         }
1308       break;
1309
1310       /*-----------------------------------------------------------------*/
1311       case OP_TYPESTAR:
1312       case OP_TYPEMINSTAR:
1313       case OP_TYPEPOSSTAR:
1314       ADD_ACTIVE(state_offset + 2, 0);
1315       if (clen > 0)
1316         {
1317         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1318             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1319             NLBLOCK->nltype == NLTYPE_FIXED &&
1320             NLBLOCK->nllen == 2 &&
1321             c == NLBLOCK->nl[0])
1322           {
1323           could_continue = partial_newline = TRUE;
1324           }
1325         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1326             (c < 256 &&
1327               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1328               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1329           {
1330           if (codevalue == OP_TYPEPOSSTAR)
1331             {
1332             active_count--;            /* Remove non-match possibility */
1333             next_active_state--;
1334             }
1335           ADD_NEW(state_offset, 0);
1336           }
1337         }
1338       break;
1339
1340       /*-----------------------------------------------------------------*/
1341       case OP_TYPEEXACT:
1342       count = current_state->count;  /* Number already matched */
1343       if (clen > 0)
1344         {
1345         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1346             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1347             NLBLOCK->nltype == NLTYPE_FIXED &&
1348             NLBLOCK->nllen == 2 &&
1349             c == NLBLOCK->nl[0])
1350           {
1351           could_continue = partial_newline = TRUE;
1352           }
1353         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1354             (c < 256 &&
1355               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1356               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1357           {
1358           if (++count >= (int)GET2(code, 1))
1359             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1360           else
1361             { ADD_NEW(state_offset, count); }
1362           }
1363         }
1364       break;
1365
1366       /*-----------------------------------------------------------------*/
1367       case OP_TYPEUPTO:
1368       case OP_TYPEMINUPTO:
1369       case OP_TYPEPOSUPTO:
1370       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1371       count = current_state->count;  /* Number already matched */
1372       if (clen > 0)
1373         {
1374         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1375             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1376             NLBLOCK->nltype == NLTYPE_FIXED &&
1377             NLBLOCK->nllen == 2 &&
1378             c == NLBLOCK->nl[0])
1379           {
1380           could_continue = partial_newline = TRUE;
1381           }
1382         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1383             (c < 256 &&
1384               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1385               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1386           {
1387           if (codevalue == OP_TYPEPOSUPTO)
1388             {
1389             active_count--;           /* Remove non-match possibility */
1390             next_active_state--;
1391             }
1392           if (++count >= (int)GET2(code, 1))
1393             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1394           else
1395             { ADD_NEW(state_offset, count); }
1396           }
1397         }
1398       break;
1399
1400 /* ========================================================================== */
1401       /* These are virtual opcodes that are used when something like
1402       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1403       argument. It keeps the code above fast for the other cases. The argument
1404       is in the d variable. */
1405
1406 #ifdef SUPPORT_UNICODE
1407       case OP_PROP_EXTRA + OP_TYPEPLUS:
1408       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1409       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1410       count = current_state->count;           /* Already matched */
1411       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1412       if (clen > 0)
1413         {
1414         BOOL OK;
1415         const uint32_t *cp;
1416         const ucd_record * prop = GET_UCD(c);
1417         switch(code[2])
1418           {
1419           case PT_ANY:
1420           OK = TRUE;
1421           break;
1422
1423           case PT_LAMP:
1424           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1425             prop->chartype == ucp_Lt;
1426           break;
1427
1428           case PT_GC:
1429           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1430           break;
1431
1432           case PT_PC:
1433           OK = prop->chartype == code[3];
1434           break;
1435
1436           case PT_SC:
1437           OK = prop->script == code[3];
1438           break;
1439
1440           /* These are specials for combination cases. */
1441
1442           case PT_ALNUM:
1443           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1444                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1445           break;
1446
1447           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1448           which means that Perl space and POSIX space are now identical. PCRE
1449           was changed at release 8.34. */
1450
1451           case PT_SPACE:    /* Perl space */
1452           case PT_PXSPACE:  /* POSIX space */
1453           switch(c)
1454             {
1455             HSPACE_CASES:
1456             VSPACE_CASES:
1457             OK = TRUE;
1458             break;
1459
1460             default:
1461             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1462             break;
1463             }
1464           break;
1465
1466           case PT_WORD:
1467           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1468                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1469                c == CHAR_UNDERSCORE;
1470           break;
1471
1472           case PT_CLIST:
1473           cp = PRIV(ucd_caseless_sets) + code[3];
1474           for (;;)
1475             {
1476             if (c < *cp) { OK = FALSE; break; }
1477             if (c == *cp++) { OK = TRUE; break; }
1478             }
1479           break;
1480
1481           case PT_UCNC:
1482           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1483                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1484                c >= 0xe000;
1485           break;
1486
1487           /* Should never occur, but keep compilers from grumbling. */
1488
1489           default:
1490           OK = codevalue != OP_PROP;
1491           break;
1492           }
1493
1494         if (OK == (d == OP_PROP))
1495           {
1496           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1497             {
1498             active_count--;           /* Remove non-match possibility */
1499             next_active_state--;
1500             }
1501           count++;
1502           ADD_NEW(state_offset, count);
1503           }
1504         }
1505       break;
1506
1507       /*-----------------------------------------------------------------*/
1508       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1509       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1510       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1511       count = current_state->count;  /* Already matched */
1512       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1513       if (clen > 0)
1514         {
1515         int ncount = 0;
1516         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1517           {
1518           active_count--;           /* Remove non-match possibility */
1519           next_active_state--;
1520           }
1521         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1522           &ncount);
1523         count++;
1524         ADD_NEW_DATA(-state_offset, count, ncount);
1525         }
1526       break;
1527 #endif
1528
1529       /*-----------------------------------------------------------------*/
1530       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1531       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1532       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1533       count = current_state->count;  /* Already matched */
1534       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1535       if (clen > 0)
1536         {
1537         int ncount = 0;
1538         switch (c)
1539           {
1540           case CHAR_VT:
1541           case CHAR_FF:
1542           case CHAR_NEL:
1543 #ifndef EBCDIC
1544           case 0x2028:
1545           case 0x2029:
1546 #endif  /* Not EBCDIC */
1547           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1548           goto ANYNL01;
1549
1550           case CHAR_CR:
1551           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1552           /* Fall through */
1553
1554           ANYNL01:
1555           case CHAR_LF:
1556           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1557             {
1558             active_count--;           /* Remove non-match possibility */
1559             next_active_state--;
1560             }
1561           count++;
1562           ADD_NEW_DATA(-state_offset, count, ncount);
1563           break;
1564
1565           default:
1566           break;
1567           }
1568         }
1569       break;
1570
1571       /*-----------------------------------------------------------------*/
1572       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1573       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1574       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1575       count = current_state->count;  /* Already matched */
1576       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1577       if (clen > 0)
1578         {
1579         BOOL OK;
1580         switch (c)
1581           {
1582           VSPACE_CASES:
1583           OK = TRUE;
1584           break;
1585
1586           default:
1587           OK = FALSE;
1588           break;
1589           }
1590
1591         if (OK == (d == OP_VSPACE))
1592           {
1593           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1594             {
1595             active_count--;           /* Remove non-match possibility */
1596             next_active_state--;
1597             }
1598           count++;
1599           ADD_NEW_DATA(-state_offset, count, 0);
1600           }
1601         }
1602       break;
1603
1604       /*-----------------------------------------------------------------*/
1605       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1606       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1607       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1608       count = current_state->count;  /* Already matched */
1609       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1610       if (clen > 0)
1611         {
1612         BOOL OK;
1613         switch (c)
1614           {
1615           HSPACE_CASES:
1616           OK = TRUE;
1617           break;
1618
1619           default:
1620           OK = FALSE;
1621           break;
1622           }
1623
1624         if (OK == (d == OP_HSPACE))
1625           {
1626           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1627             {
1628             active_count--;           /* Remove non-match possibility */
1629             next_active_state--;
1630             }
1631           count++;
1632           ADD_NEW_DATA(-state_offset, count, 0);
1633           }
1634         }
1635       break;
1636
1637       /*-----------------------------------------------------------------*/
1638 #ifdef SUPPORT_UNICODE
1639       case OP_PROP_EXTRA + OP_TYPEQUERY:
1640       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1641       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1642       count = 4;
1643       goto QS1;
1644
1645       case OP_PROP_EXTRA + OP_TYPESTAR:
1646       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1647       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1648       count = 0;
1649
1650       QS1:
1651
1652       ADD_ACTIVE(state_offset + 4, 0);
1653       if (clen > 0)
1654         {
1655         BOOL OK;
1656         const uint32_t *cp;
1657         const ucd_record * prop = GET_UCD(c);
1658         switch(code[2])
1659           {
1660           case PT_ANY:
1661           OK = TRUE;
1662           break;
1663
1664           case PT_LAMP:
1665           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1666             prop->chartype == ucp_Lt;
1667           break;
1668
1669           case PT_GC:
1670           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1671           break;
1672
1673           case PT_PC:
1674           OK = prop->chartype == code[3];
1675           break;
1676
1677           case PT_SC:
1678           OK = prop->script == code[3];
1679           break;
1680
1681           /* These are specials for combination cases. */
1682
1683           case PT_ALNUM:
1684           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1685                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1686           break;
1687
1688           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1689           which means that Perl space and POSIX space are now identical. PCRE
1690           was changed at release 8.34. */
1691
1692           case PT_SPACE:    /* Perl space */
1693           case PT_PXSPACE:  /* POSIX space */
1694           switch(c)
1695             {
1696             HSPACE_CASES:
1697             VSPACE_CASES:
1698             OK = TRUE;
1699             break;
1700
1701             default:
1702             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1703             break;
1704             }
1705           break;
1706
1707           case PT_WORD:
1708           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1709                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1710                c == CHAR_UNDERSCORE;
1711           break;
1712
1713           case PT_CLIST:
1714           cp = PRIV(ucd_caseless_sets) + code[3];
1715           for (;;)
1716             {
1717             if (c < *cp) { OK = FALSE; break; }
1718             if (c == *cp++) { OK = TRUE; break; }
1719             }
1720           break;
1721
1722           case PT_UCNC:
1723           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1724                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1725                c >= 0xe000;
1726           break;
1727
1728           /* Should never occur, but keep compilers from grumbling. */
1729
1730           default:
1731           OK = codevalue != OP_PROP;
1732           break;
1733           }
1734
1735         if (OK == (d == OP_PROP))
1736           {
1737           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1738               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1739             {
1740             active_count--;           /* Remove non-match possibility */
1741             next_active_state--;
1742             }
1743           ADD_NEW(state_offset + count, 0);
1744           }
1745         }
1746       break;
1747
1748       /*-----------------------------------------------------------------*/
1749       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1750       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1751       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1752       count = 2;
1753       goto QS2;
1754
1755       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1756       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1757       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1758       count = 0;
1759
1760       QS2:
1761
1762       ADD_ACTIVE(state_offset + 2, 0);
1763       if (clen > 0)
1764         {
1765         int ncount = 0;
1766         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1767             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1768           {
1769           active_count--;           /* Remove non-match possibility */
1770           next_active_state--;
1771           }
1772         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1773           &ncount);
1774         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1775         }
1776       break;
1777 #endif
1778
1779       /*-----------------------------------------------------------------*/
1780       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1781       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1782       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1783       count = 2;
1784       goto QS3;
1785
1786       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1787       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1788       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1789       count = 0;
1790
1791       QS3:
1792       ADD_ACTIVE(state_offset + 2, 0);
1793       if (clen > 0)
1794         {
1795         int ncount = 0;
1796         switch (c)
1797           {
1798           case CHAR_VT:
1799           case CHAR_FF:
1800           case CHAR_NEL:
1801 #ifndef EBCDIC
1802           case 0x2028:
1803           case 0x2029:
1804 #endif  /* Not EBCDIC */
1805           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1806           goto ANYNL02;
1807
1808           case CHAR_CR:
1809           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1810           /* Fall through */
1811
1812           ANYNL02:
1813           case CHAR_LF:
1814           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1815               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1816             {
1817             active_count--;           /* Remove non-match possibility */
1818             next_active_state--;
1819             }
1820           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1821           break;
1822
1823           default:
1824           break;
1825           }
1826         }
1827       break;
1828
1829       /*-----------------------------------------------------------------*/
1830       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1831       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1832       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1833       count = 2;
1834       goto QS4;
1835
1836       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1837       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1838       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1839       count = 0;
1840
1841       QS4:
1842       ADD_ACTIVE(state_offset + 2, 0);
1843       if (clen > 0)
1844         {
1845         BOOL OK;
1846         switch (c)
1847           {
1848           VSPACE_CASES:
1849           OK = TRUE;
1850           break;
1851
1852           default:
1853           OK = FALSE;
1854           break;
1855           }
1856         if (OK == (d == OP_VSPACE))
1857           {
1858           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1859               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1860             {
1861             active_count--;           /* Remove non-match possibility */
1862             next_active_state--;
1863             }
1864           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1865           }
1866         }
1867       break;
1868
1869       /*-----------------------------------------------------------------*/
1870       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1871       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1872       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1873       count = 2;
1874       goto QS5;
1875
1876       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1877       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1878       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1879       count = 0;
1880
1881       QS5:
1882       ADD_ACTIVE(state_offset + 2, 0);
1883       if (clen > 0)
1884         {
1885         BOOL OK;
1886         switch (c)
1887           {
1888           HSPACE_CASES:
1889           OK = TRUE;
1890           break;
1891
1892           default:
1893           OK = FALSE;
1894           break;
1895           }
1896
1897         if (OK == (d == OP_HSPACE))
1898           {
1899           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1900               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1901             {
1902             active_count--;           /* Remove non-match possibility */
1903             next_active_state--;
1904             }
1905           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1906           }
1907         }
1908       break;
1909
1910       /*-----------------------------------------------------------------*/
1911 #ifdef SUPPORT_UNICODE
1912       case OP_PROP_EXTRA + OP_TYPEEXACT:
1913       case OP_PROP_EXTRA + OP_TYPEUPTO:
1914       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1915       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1916       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1917         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1918       count = current_state->count;  /* Number already matched */
1919       if (clen > 0)
1920         {
1921         BOOL OK;
1922         const uint32_t *cp;
1923         const ucd_record * prop = GET_UCD(c);
1924         switch(code[1 + IMM2_SIZE + 1])
1925           {
1926           case PT_ANY:
1927           OK = TRUE;
1928           break;
1929
1930           case PT_LAMP:
1931           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1932             prop->chartype == ucp_Lt;
1933           break;
1934
1935           case PT_GC:
1936           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1937           break;
1938
1939           case PT_PC:
1940           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1941           break;
1942
1943           case PT_SC:
1944           OK = prop->script == code[1 + IMM2_SIZE + 2];
1945           break;
1946
1947           /* These are specials for combination cases. */
1948
1949           case PT_ALNUM:
1950           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1951                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1952           break;
1953
1954           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1955           which means that Perl space and POSIX space are now identical. PCRE
1956           was changed at release 8.34. */
1957
1958           case PT_SPACE:    /* Perl space */
1959           case PT_PXSPACE:  /* POSIX space */
1960           switch(c)
1961             {
1962             HSPACE_CASES:
1963             VSPACE_CASES:
1964             OK = TRUE;
1965             break;
1966
1967             default:
1968             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1969             break;
1970             }
1971           break;
1972
1973           case PT_WORD:
1974           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1975                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1976                c == CHAR_UNDERSCORE;
1977           break;
1978
1979           case PT_CLIST:
1980           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1981           for (;;)
1982             {
1983             if (c < *cp) { OK = FALSE; break; }
1984             if (c == *cp++) { OK = TRUE; break; }
1985             }
1986           break;
1987
1988           case PT_UCNC:
1989           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1990                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1991                c >= 0xe000;
1992           break;
1993
1994           /* Should never occur, but keep compilers from grumbling. */
1995
1996           default:
1997           OK = codevalue != OP_PROP;
1998           break;
1999           }
2000
2001         if (OK == (d == OP_PROP))
2002           {
2003           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2004             {
2005             active_count--;           /* Remove non-match possibility */
2006             next_active_state--;
2007             }
2008           if (++count >= (int)GET2(code, 1))
2009             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2010           else
2011             { ADD_NEW(state_offset, count); }
2012           }
2013         }
2014       break;
2015
2016       /*-----------------------------------------------------------------*/
2017       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2018       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2019       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2020       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2021       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2022         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2023       count = current_state->count;  /* Number already matched */
2024       if (clen > 0)
2025         {
2026         PCRE2_SPTR nptr;
2027         int ncount = 0;
2028         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2029           {
2030           active_count--;           /* Remove non-match possibility */
2031           next_active_state--;
2032           }
2033         nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2034           &ncount);
2035         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2036             reset_could_continue = TRUE;
2037         if (++count >= (int)GET2(code, 1))
2038           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2039         else
2040           { ADD_NEW_DATA(-state_offset, count, ncount); }
2041         }
2042       break;
2043 #endif
2044
2045       /*-----------------------------------------------------------------*/
2046       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2047       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2048       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2049       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2050       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2051         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2052       count = current_state->count;  /* Number already matched */
2053       if (clen > 0)
2054         {
2055         int ncount = 0;
2056         switch (c)
2057           {
2058           case CHAR_VT:
2059           case CHAR_FF:
2060           case CHAR_NEL:
2061 #ifndef EBCDIC
2062           case 0x2028:
2063           case 0x2029:
2064 #endif  /* Not EBCDIC */
2065           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2066           goto ANYNL03;
2067
2068           case CHAR_CR:
2069           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2070           /* Fall through */
2071
2072           ANYNL03:
2073           case CHAR_LF:
2074           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2075             {
2076             active_count--;           /* Remove non-match possibility */
2077             next_active_state--;
2078             }
2079           if (++count >= (int)GET2(code, 1))
2080             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2081           else
2082             { ADD_NEW_DATA(-state_offset, count, ncount); }
2083           break;
2084
2085           default:
2086           break;
2087           }
2088         }
2089       break;
2090
2091       /*-----------------------------------------------------------------*/
2092       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2093       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2094       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2095       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2096       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2097         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2098       count = current_state->count;  /* Number already matched */
2099       if (clen > 0)
2100         {
2101         BOOL OK;
2102         switch (c)
2103           {
2104           VSPACE_CASES:
2105           OK = TRUE;
2106           break;
2107
2108           default:
2109           OK = FALSE;
2110           }
2111
2112         if (OK == (d == OP_VSPACE))
2113           {
2114           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2115             {
2116             active_count--;           /* Remove non-match possibility */
2117             next_active_state--;
2118             }
2119           if (++count >= (int)GET2(code, 1))
2120             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2121           else
2122             { ADD_NEW_DATA(-state_offset, count, 0); }
2123           }
2124         }
2125       break;
2126
2127       /*-----------------------------------------------------------------*/
2128       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2129       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2130       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2131       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2132       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2133         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2134       count = current_state->count;  /* Number already matched */
2135       if (clen > 0)
2136         {
2137         BOOL OK;
2138         switch (c)
2139           {
2140           HSPACE_CASES:
2141           OK = TRUE;
2142           break;
2143
2144           default:
2145           OK = FALSE;
2146           break;
2147           }
2148
2149         if (OK == (d == OP_HSPACE))
2150           {
2151           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2152             {
2153             active_count--;           /* Remove non-match possibility */
2154             next_active_state--;
2155             }
2156           if (++count >= (int)GET2(code, 1))
2157             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2158           else
2159             { ADD_NEW_DATA(-state_offset, count, 0); }
2160           }
2161         }
2162       break;
2163
2164 /* ========================================================================== */
2165       /* These opcodes are followed by a character that is usually compared
2166       to the current subject character; it is loaded into d. We still get
2167       here even if there is no subject character, because in some cases zero
2168       repetitions are permitted. */
2169
2170       /*-----------------------------------------------------------------*/
2171       case OP_CHAR:
2172       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2173       break;
2174
2175       /*-----------------------------------------------------------------*/
2176       case OP_CHARI:
2177       if (clen == 0) break;
2178
2179 #ifdef SUPPORT_UNICODE
2180       if (utf)
2181         {
2182         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2183           {
2184           unsigned int othercase;
2185           if (c < 128)
2186             othercase = fcc[c];
2187           else
2188             othercase = UCD_OTHERCASE(c);
2189           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2190           }
2191         }
2192       else
2193 #endif  /* SUPPORT_UNICODE */
2194       /* Not UTF mode */
2195         {
2196         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2197           { ADD_NEW(state_offset + 2, 0); }
2198         }
2199       break;
2200
2201
2202 #ifdef SUPPORT_UNICODE
2203       /*-----------------------------------------------------------------*/
2204       /* This is a tricky one because it can match more than one character.
2205       Find out how many characters to skip, and then set up a negative state
2206       to wait for them to pass before continuing. */
2207
2208       case OP_EXTUNI:
2209       if (clen > 0)
2210         {
2211         int ncount = 0;
2212         PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2213           end_subject, utf, &ncount);
2214         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2215             reset_could_continue = TRUE;
2216         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2217         }
2218       break;
2219 #endif
2220
2221       /*-----------------------------------------------------------------*/
2222       /* This is tricky like EXTUNI because it too can match more than one
2223       character (when CR is followed by LF). In this case, set up a negative
2224       state to wait for one character to pass before continuing. */
2225
2226       case OP_ANYNL:
2227       if (clen > 0) switch(c)
2228         {
2229         case CHAR_VT:
2230         case CHAR_FF:
2231         case CHAR_NEL:
2232 #ifndef EBCDIC
2233         case 0x2028:
2234         case 0x2029:
2235 #endif  /* Not EBCDIC */
2236         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2237         /* Fall through */
2238
2239         case CHAR_LF:
2240         ADD_NEW(state_offset + 1, 0);
2241         break;
2242
2243         case CHAR_CR:
2244         if (ptr + 1 >= end_subject)
2245           {
2246           ADD_NEW(state_offset + 1, 0);
2247           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2248             reset_could_continue = TRUE;
2249           }
2250         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2251           {
2252           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2253           }
2254         else
2255           {
2256           ADD_NEW(state_offset + 1, 0);
2257           }
2258         break;
2259         }
2260       break;
2261
2262       /*-----------------------------------------------------------------*/
2263       case OP_NOT_VSPACE:
2264       if (clen > 0) switch(c)
2265         {
2266         VSPACE_CASES:
2267         break;
2268
2269         default:
2270         ADD_NEW(state_offset + 1, 0);
2271         break;
2272         }
2273       break;
2274
2275       /*-----------------------------------------------------------------*/
2276       case OP_VSPACE:
2277       if (clen > 0) switch(c)
2278         {
2279         VSPACE_CASES:
2280         ADD_NEW(state_offset + 1, 0);
2281         break;
2282
2283         default:
2284         break;
2285         }
2286       break;
2287
2288       /*-----------------------------------------------------------------*/
2289       case OP_NOT_HSPACE:
2290       if (clen > 0) switch(c)
2291         {
2292         HSPACE_CASES:
2293         break;
2294
2295         default:
2296         ADD_NEW(state_offset + 1, 0);
2297         break;
2298         }
2299       break;
2300
2301       /*-----------------------------------------------------------------*/
2302       case OP_HSPACE:
2303       if (clen > 0) switch(c)
2304         {
2305         HSPACE_CASES:
2306         ADD_NEW(state_offset + 1, 0);
2307         break;
2308
2309         default:
2310         break;
2311         }
2312       break;
2313
2314       /*-----------------------------------------------------------------*/
2315       /* Match a negated single character casefully. */
2316
2317       case OP_NOT:
2318       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2319       break;
2320
2321       /*-----------------------------------------------------------------*/
2322       /* Match a negated single character caselessly. */
2323
2324       case OP_NOTI:
2325       if (clen > 0)
2326         {
2327         uint32_t otherd;
2328 #ifdef SUPPORT_UNICODE
2329         if (utf && d >= 128)
2330           otherd = UCD_OTHERCASE(d);
2331         else
2332 #endif  /* SUPPORT_UNICODE */
2333         otherd = TABLE_GET(d, fcc, d);
2334         if (c != d && c != otherd)
2335           { ADD_NEW(state_offset + dlen + 1, 0); }
2336         }
2337       break;
2338
2339       /*-----------------------------------------------------------------*/
2340       case OP_PLUSI:
2341       case OP_MINPLUSI:
2342       case OP_POSPLUSI:
2343       case OP_NOTPLUSI:
2344       case OP_NOTMINPLUSI:
2345       case OP_NOTPOSPLUSI:
2346       caseless = TRUE;
2347       codevalue -= OP_STARI - OP_STAR;
2348
2349       /* Fall through */
2350       case OP_PLUS:
2351       case OP_MINPLUS:
2352       case OP_POSPLUS:
2353       case OP_NOTPLUS:
2354       case OP_NOTMINPLUS:
2355       case OP_NOTPOSPLUS:
2356       count = current_state->count;  /* Already matched */
2357       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2358       if (clen > 0)
2359         {
2360         uint32_t otherd = NOTACHAR;
2361         if (caseless)
2362           {
2363 #ifdef SUPPORT_UNICODE
2364           if (utf && d >= 128)
2365             otherd = UCD_OTHERCASE(d);
2366           else
2367 #endif  /* SUPPORT_UNICODE */
2368           otherd = TABLE_GET(d, fcc, d);
2369           }
2370         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2371           {
2372           if (count > 0 &&
2373               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2374             {
2375             active_count--;             /* Remove non-match possibility */
2376             next_active_state--;
2377             }
2378           count++;
2379           ADD_NEW(state_offset, count);
2380           }
2381         }
2382       break;
2383
2384       /*-----------------------------------------------------------------*/
2385       case OP_QUERYI:
2386       case OP_MINQUERYI:
2387       case OP_POSQUERYI:
2388       case OP_NOTQUERYI:
2389       case OP_NOTMINQUERYI:
2390       case OP_NOTPOSQUERYI:
2391       caseless = TRUE;
2392       codevalue -= OP_STARI - OP_STAR;
2393       /* Fall through */
2394       case OP_QUERY:
2395       case OP_MINQUERY:
2396       case OP_POSQUERY:
2397       case OP_NOTQUERY:
2398       case OP_NOTMINQUERY:
2399       case OP_NOTPOSQUERY:
2400       ADD_ACTIVE(state_offset + dlen + 1, 0);
2401       if (clen > 0)
2402         {
2403         uint32_t otherd = NOTACHAR;
2404         if (caseless)
2405           {
2406 #ifdef SUPPORT_UNICODE
2407           if (utf && d >= 128)
2408             otherd = UCD_OTHERCASE(d);
2409           else
2410 #endif  /* SUPPORT_UNICODE */
2411           otherd = TABLE_GET(d, fcc, d);
2412           }
2413         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2414           {
2415           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2416             {
2417             active_count--;            /* Remove non-match possibility */
2418             next_active_state--;
2419             }
2420           ADD_NEW(state_offset + dlen + 1, 0);
2421           }
2422         }
2423       break;
2424
2425       /*-----------------------------------------------------------------*/
2426       case OP_STARI:
2427       case OP_MINSTARI:
2428       case OP_POSSTARI:
2429       case OP_NOTSTARI:
2430       case OP_NOTMINSTARI:
2431       case OP_NOTPOSSTARI:
2432       caseless = TRUE;
2433       codevalue -= OP_STARI - OP_STAR;
2434       /* Fall through */
2435       case OP_STAR:
2436       case OP_MINSTAR:
2437       case OP_POSSTAR:
2438       case OP_NOTSTAR:
2439       case OP_NOTMINSTAR:
2440       case OP_NOTPOSSTAR:
2441       ADD_ACTIVE(state_offset + dlen + 1, 0);
2442       if (clen > 0)
2443         {
2444         uint32_t otherd = NOTACHAR;
2445         if (caseless)
2446           {
2447 #ifdef SUPPORT_UNICODE
2448           if (utf && d >= 128)
2449             otherd = UCD_OTHERCASE(d);
2450           else
2451 #endif  /* SUPPORT_UNICODE */
2452           otherd = TABLE_GET(d, fcc, d);
2453           }
2454         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2455           {
2456           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2457             {
2458             active_count--;            /* Remove non-match possibility */
2459             next_active_state--;
2460             }
2461           ADD_NEW(state_offset, 0);
2462           }
2463         }
2464       break;
2465
2466       /*-----------------------------------------------------------------*/
2467       case OP_EXACTI:
2468       case OP_NOTEXACTI:
2469       caseless = TRUE;
2470       codevalue -= OP_STARI - OP_STAR;
2471       /* Fall through */
2472       case OP_EXACT:
2473       case OP_NOTEXACT:
2474       count = current_state->count;  /* Number already matched */
2475       if (clen > 0)
2476         {
2477         uint32_t otherd = NOTACHAR;
2478         if (caseless)
2479           {
2480 #ifdef SUPPORT_UNICODE
2481           if (utf && d >= 128)
2482             otherd = UCD_OTHERCASE(d);
2483           else
2484 #endif  /* SUPPORT_UNICODE */
2485           otherd = TABLE_GET(d, fcc, d);
2486           }
2487         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2488           {
2489           if (++count >= (int)GET2(code, 1))
2490             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2491           else
2492             { ADD_NEW(state_offset, count); }
2493           }
2494         }
2495       break;
2496
2497       /*-----------------------------------------------------------------*/
2498       case OP_UPTOI:
2499       case OP_MINUPTOI:
2500       case OP_POSUPTOI:
2501       case OP_NOTUPTOI:
2502       case OP_NOTMINUPTOI:
2503       case OP_NOTPOSUPTOI:
2504       caseless = TRUE;
2505       codevalue -= OP_STARI - OP_STAR;
2506       /* Fall through */
2507       case OP_UPTO:
2508       case OP_MINUPTO:
2509       case OP_POSUPTO:
2510       case OP_NOTUPTO:
2511       case OP_NOTMINUPTO:
2512       case OP_NOTPOSUPTO:
2513       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2514       count = current_state->count;  /* Number already matched */
2515       if (clen > 0)
2516         {
2517         uint32_t otherd = NOTACHAR;
2518         if (caseless)
2519           {
2520 #ifdef SUPPORT_UNICODE
2521           if (utf && d >= 128)
2522             otherd = UCD_OTHERCASE(d);
2523           else
2524 #endif  /* SUPPORT_UNICODE */
2525           otherd = TABLE_GET(d, fcc, d);
2526           }
2527         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2528           {
2529           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2530             {
2531             active_count--;             /* Remove non-match possibility */
2532             next_active_state--;
2533             }
2534           if (++count >= (int)GET2(code, 1))
2535             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2536           else
2537             { ADD_NEW(state_offset, count); }
2538           }
2539         }
2540       break;
2541
2542
2543 /* ========================================================================== */
2544       /* These are the class-handling opcodes */
2545
2546       case OP_CLASS:
2547       case OP_NCLASS:
2548       case OP_XCLASS:
2549         {
2550         BOOL isinclass = FALSE;
2551         int next_state_offset;
2552         PCRE2_SPTR ecode;
2553
2554         /* For a simple class, there is always just a 32-byte table, and we
2555         can set isinclass from it. */
2556
2557         if (codevalue != OP_XCLASS)
2558           {
2559           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2560           if (clen > 0)
2561             {
2562             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2563               ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2564             }
2565           }
2566
2567         /* An extended class may have a table or a list of single characters,
2568         ranges, or both, and it may be positive or negative. There's a
2569         function that sorts all this out. */
2570
2571         else
2572          {
2573          ecode = code + GET(code, 1);
2574          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2575          }
2576
2577         /* At this point, isinclass is set for all kinds of class, and ecode
2578         points to the code unit after the end of the class. If there is a
2579         quantifier, this is where it will be. */
2580
2581         next_state_offset = (int)(ecode - start_code);
2582
2583         switch (*ecode)
2584           {
2585           case OP_CRSTAR:
2586           case OP_CRMINSTAR:
2587           case OP_CRPOSSTAR:
2588           ADD_ACTIVE(next_state_offset + 1, 0);
2589           if (isinclass)
2590             {
2591             if (*ecode == OP_CRPOSSTAR)
2592               {
2593               active_count--;           /* Remove non-match possibility */
2594               next_active_state--;
2595               }
2596             ADD_NEW(state_offset, 0);
2597             }
2598           break;
2599
2600           case OP_CRPLUS:
2601           case OP_CRMINPLUS:
2602           case OP_CRPOSPLUS:
2603           count = current_state->count;  /* Already matched */
2604           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2605           if (isinclass)
2606             {
2607             if (count > 0 && *ecode == OP_CRPOSPLUS)
2608               {
2609               active_count--;           /* Remove non-match possibility */
2610               next_active_state--;
2611               }
2612             count++;
2613             ADD_NEW(state_offset, count);
2614             }
2615           break;
2616
2617           case OP_CRQUERY:
2618           case OP_CRMINQUERY:
2619           case OP_CRPOSQUERY:
2620           ADD_ACTIVE(next_state_offset + 1, 0);
2621           if (isinclass)
2622             {
2623             if (*ecode == OP_CRPOSQUERY)
2624               {
2625               active_count--;           /* Remove non-match possibility */
2626               next_active_state--;
2627               }
2628             ADD_NEW(next_state_offset + 1, 0);
2629             }
2630           break;
2631
2632           case OP_CRRANGE:
2633           case OP_CRMINRANGE:
2634           case OP_CRPOSRANGE:
2635           count = current_state->count;  /* Already matched */
2636           if (count >= (int)GET2(ecode, 1))
2637             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2638           if (isinclass)
2639             {
2640             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2641
2642             if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2643               {
2644               active_count--;           /* Remove non-match possibility */
2645               next_active_state--;
2646               }
2647
2648             if (++count >= max && max != 0)   /* Max 0 => no limit */
2649               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2650             else
2651               { ADD_NEW(state_offset, count); }
2652             }
2653           break;
2654
2655           default:
2656           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2657           break;
2658           }
2659         }
2660       break;
2661
2662 /* ========================================================================== */
2663       /* These are the opcodes for fancy brackets of various kinds. We have
2664       to use recursion in order to handle them. The "always failing" assertion
2665       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2666       though the other "backtracking verbs" are not supported. */
2667
2668       case OP_FAIL:
2669       forced_fail++;    /* Count FAILs for multiple states */
2670       break;
2671
2672       case OP_ASSERT:
2673       case OP_ASSERT_NOT:
2674       case OP_ASSERTBACK:
2675       case OP_ASSERTBACK_NOT:
2676         {
2677         int rc;
2678         int *local_workspace;
2679         PCRE2_SIZE *local_offsets;
2680         PCRE2_SPTR endasscode = code + GET(code, 1);
2681         RWS_anchor *rws = (RWS_anchor *)RWS;
2682
2683         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2684           {
2685           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2686           if (rc != 0) return rc;
2687           RWS = (int *)rws;
2688           }
2689
2690         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2691         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2692         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2693
2694         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2695
2696         rc = internal_dfa_match(
2697           mb,                                   /* static match data */
2698           code,                                 /* this subexpression's code */
2699           ptr,                                  /* where we currently are */
2700           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2701           local_offsets,                        /* offset vector */
2702           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2703           local_workspace,                      /* workspace vector */
2704           RWS_RSIZE,                            /* size of same */
2705           rlevel,                               /* function recursion level */
2706           RWS);                                 /* recursion workspace */
2707
2708         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2709
2710         if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2711         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2712             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2713         }
2714       break;
2715
2716       /*-----------------------------------------------------------------*/
2717       case OP_COND:
2718       case OP_SCOND:
2719         {
2720         int codelink = (int)GET(code, 1);
2721         PCRE2_UCHAR condcode;
2722
2723         /* Because of the way auto-callout works during compile, a callout item
2724         is inserted between OP_COND and an assertion condition. This does not
2725         happen for the other conditions. */
2726
2727         if (code[LINK_SIZE + 1] == OP_CALLOUT
2728             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2729           {
2730           PCRE2_SIZE callout_length;
2731           rrc = do_callout(code, offsets, current_subject, ptr, mb,
2732             1 + LINK_SIZE, &callout_length);
2733           if (rrc < 0) return rrc;                 /* Abandon */
2734           if (rrc > 0) break;                      /* Fail this thread */
2735           code += callout_length;                  /* Skip callout data */
2736           }
2737
2738         condcode = code[LINK_SIZE+1];
2739
2740         /* Back reference conditions and duplicate named recursion conditions
2741         are not supported */
2742
2743         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2744             condcode == OP_DNRREF)
2745           return PCRE2_ERROR_DFA_UCOND;
2746
2747         /* The DEFINE condition is always false, and the assertion (?!) is
2748         converted to OP_FAIL. */
2749
2750         if (condcode == OP_FALSE || condcode == OP_FAIL)
2751           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2752
2753         /* There is also an always-true condition */
2754
2755         else if (condcode == OP_TRUE)
2756           { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2757
2758         /* The only supported version of OP_RREF is for the value RREF_ANY,
2759         which means "test if in any recursion". We can't test for specifically
2760         recursed groups. */
2761
2762         else if (condcode == OP_RREF)
2763           {
2764           unsigned int value = GET2(code, LINK_SIZE + 2);
2765           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2766           if (mb->recursive != NULL)
2767             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2768           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2769           }
2770
2771         /* Otherwise, the condition is an assertion */
2772
2773         else
2774           {
2775           int rc;
2776           int *local_workspace;
2777           PCRE2_SIZE *local_offsets;
2778           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2779           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2780           RWS_anchor *rws = (RWS_anchor *)RWS;
2781
2782           if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2783             {
2784             rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2785             if (rc != 0) return rc;
2786             RWS = (int *)rws;
2787             }
2788
2789           local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2790           local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2791           rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2792
2793           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2794
2795           rc = internal_dfa_match(
2796             mb,                                   /* fixed match data */
2797             asscode,                              /* this subexpression's code */
2798             ptr,                                  /* where we currently are */
2799             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2800             local_offsets,                        /* offset vector */
2801             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2802             local_workspace,                      /* workspace vector */
2803             RWS_RSIZE,                            /* size of same */
2804             rlevel,                               /* function recursion level */
2805             RWS);                                 /* recursion workspace */
2806
2807           rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2808
2809           if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2810           if ((rc >= 0) ==
2811                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2812             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2813           else
2814             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2815           }
2816         }
2817       break;
2818
2819       /*-----------------------------------------------------------------*/
2820       case OP_RECURSE:
2821         {
2822         int rc;
2823         int *local_workspace;
2824         PCRE2_SIZE *local_offsets;
2825         RWS_anchor *rws = (RWS_anchor *)RWS;
2826         dfa_recursion_info *ri;
2827         PCRE2_SPTR callpat = start_code + GET(code, 1);
2828         uint32_t recno = (callpat == mb->start_code)? 0 :
2829           GET2(callpat, 1 + LINK_SIZE);
2830
2831         if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2832           {
2833           rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2834           if (rc != 0) return rc;
2835           RWS = (int *)rws;
2836           }
2837
2838         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2839         local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2840         rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2841
2842         /* Check for repeating a recursion without advancing the subject
2843         pointer. This should catch convoluted mutual recursions. (Some simple
2844         cases are caught at compile time.) */
2845
2846         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2847           if (recno == ri->group_num && ptr == ri->subject_position)
2848             return PCRE2_ERROR_RECURSELOOP;
2849
2850         /* Remember this recursion and where we started it so as to
2851         catch infinite loops. */
2852
2853         new_recursive.group_num = recno;
2854         new_recursive.subject_position = ptr;
2855         new_recursive.prevrec = mb->recursive;
2856         mb->recursive = &new_recursive;
2857
2858         rc = internal_dfa_match(
2859           mb,                                   /* fixed match data */
2860           callpat,                              /* this subexpression's code */
2861           ptr,                                  /* where we currently are */
2862           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2863           local_offsets,                        /* offset vector */
2864           RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2865           local_workspace,                      /* workspace vector */
2866           RWS_RSIZE,                            /* size of same */
2867           rlevel,                               /* function recursion level */
2868           RWS);                                 /* recursion workspace */
2869
2870         rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2871         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2872
2873         /* Ran out of internal offsets */
2874
2875         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2876
2877         /* For each successful matched substring, set up the next state with a
2878         count of characters to skip before trying it. Note that the count is in
2879         characters, not bytes. */
2880
2881         if (rc > 0)
2882           {
2883           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2884             {
2885             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2886 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2887             if (utf)
2888               {
2889               PCRE2_SPTR p = start_subject + local_offsets[rc];
2890               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2891               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2892               }
2893 #endif
2894             if (charcount > 0)
2895               {
2896               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2897                 (int)(charcount - 1));
2898               }
2899             else
2900               {
2901               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2902               }
2903             }
2904           }
2905         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2906         }
2907       break;
2908
2909       /*-----------------------------------------------------------------*/
2910       case OP_BRAPOS:
2911       case OP_SBRAPOS:
2912       case OP_CBRAPOS:
2913       case OP_SCBRAPOS:
2914       case OP_BRAPOSZERO:
2915         {
2916         int rc;
2917         int *local_workspace;
2918         PCRE2_SIZE *local_offsets;
2919         PCRE2_SIZE charcount, matched_count;
2920         PCRE2_SPTR local_ptr = ptr;
2921         RWS_anchor *rws = (RWS_anchor *)RWS;
2922         BOOL allow_zero;
2923
2924         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2925           {
2926           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2927           if (rc != 0) return rc;
2928           RWS = (int *)rws;
2929           }
2930
2931         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2932         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2933         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2934
2935         if (codevalue == OP_BRAPOSZERO)
2936           {
2937           allow_zero = TRUE;
2938           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2939           }
2940         else allow_zero = FALSE;
2941
2942         /* Loop to match the subpattern as many times as possible as if it were
2943         a complete pattern. */
2944
2945         for (matched_count = 0;; matched_count++)
2946           {
2947           rc = internal_dfa_match(
2948             mb,                                   /* fixed match data */
2949             code,                                 /* this subexpression's code */
2950             local_ptr,                            /* where we currently are */
2951             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2952             local_offsets,                        /* offset vector */
2953             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2954             local_workspace,                      /* workspace vector */
2955             RWS_RSIZE,                            /* size of same */
2956             rlevel,                               /* function recursion level */
2957             RWS);                                 /* recursion workspace */
2958
2959           /* Failed to match */
2960
2961           if (rc < 0)
2962             {
2963             if (rc != PCRE2_ERROR_NOMATCH) return rc;
2964             break;
2965             }
2966
2967           /* Matched: break the loop if zero characters matched. */
2968
2969           charcount = local_offsets[1] - local_offsets[0];
2970           if (charcount == 0) break;
2971           local_ptr += charcount;    /* Advance temporary position ptr */
2972           }
2973
2974         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2975
2976         /* At this point we have matched the subpattern matched_count
2977         times, and local_ptr is pointing to the character after the end of the
2978         last match. */
2979
2980         if (matched_count > 0 || allow_zero)
2981           {
2982           PCRE2_SPTR end_subpattern = code;
2983           int next_state_offset;
2984
2985           do { end_subpattern += GET(end_subpattern, 1); }
2986             while (*end_subpattern == OP_ALT);
2987           next_state_offset =
2988             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2989
2990           /* Optimization: if there are no more active states, and there
2991           are no new states yet set up, then skip over the subject string
2992           right here, to save looping. Otherwise, set up the new state to swing
2993           into action when the end of the matched substring is reached. */
2994
2995           if (i + 1 >= active_count && new_count == 0)
2996             {
2997             ptr = local_ptr;
2998             clen = 0;
2999             ADD_NEW(next_state_offset, 0);
3000             }
3001           else
3002             {
3003             PCRE2_SPTR p = ptr;
3004             PCRE2_SPTR pp = local_ptr;
3005             charcount = (PCRE2_SIZE)(pp - p);
3006 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3007             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3008 #endif
3009             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3010             }
3011           }
3012         }
3013       break;
3014
3015       /*-----------------------------------------------------------------*/
3016       case OP_ONCE:
3017         {
3018         int rc;
3019         int *local_workspace;
3020         PCRE2_SIZE *local_offsets;
3021         RWS_anchor *rws = (RWS_anchor *)RWS;
3022
3023         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3024           {
3025           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3026           if (rc != 0) return rc;
3027           RWS = (int *)rws;
3028           }
3029
3030         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3031         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3032         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3033
3034         rc = internal_dfa_match(
3035           mb,                                   /* fixed match data */
3036           code,                                 /* this subexpression's code */
3037           ptr,                                  /* where we currently are */
3038           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3039           local_offsets,                        /* offset vector */
3040           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3041           local_workspace,                      /* workspace vector */
3042           RWS_RSIZE,                            /* size of same */
3043           rlevel,                               /* function recursion level */
3044           RWS);                                 /* recursion workspace */
3045
3046         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3047
3048         if (rc >= 0)
3049           {
3050           PCRE2_SPTR end_subpattern = code;
3051           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3052           int next_state_offset, repeat_state_offset;
3053
3054           do { end_subpattern += GET(end_subpattern, 1); }
3055             while (*end_subpattern == OP_ALT);
3056           next_state_offset =
3057             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3058
3059           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3060           arrange for the repeat state also to be added to the relevant list.
3061           Calculate the offset, or set -1 for no repeat. */
3062
3063           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3064                                  *end_subpattern == OP_KETRMIN)?
3065             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3066
3067           /* If we have matched an empty string, add the next state at the
3068           current character pointer. This is important so that the duplicate
3069           checking kicks in, which is what breaks infinite loops that match an
3070           empty string. */
3071
3072           if (charcount == 0)
3073             {
3074             ADD_ACTIVE(next_state_offset, 0);
3075             }
3076
3077           /* Optimization: if there are no more active states, and there
3078           are no new states yet set up, then skip over the subject string
3079           right here, to save looping. Otherwise, set up the new state to swing
3080           into action when the end of the matched substring is reached. */
3081
3082           else if (i + 1 >= active_count && new_count == 0)
3083             {
3084             ptr += charcount;
3085             clen = 0;
3086             ADD_NEW(next_state_offset, 0);
3087
3088             /* If we are adding a repeat state at the new character position,
3089             we must fudge things so that it is the only current state.
3090             Otherwise, it might be a duplicate of one we processed before, and
3091             that would cause it to be skipped. */
3092
3093             if (repeat_state_offset >= 0)
3094               {
3095               next_active_state = active_states;
3096               active_count = 0;
3097               i = -1;
3098               ADD_ACTIVE(repeat_state_offset, 0);
3099               }
3100             }
3101           else
3102             {
3103 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3104             if (utf)
3105               {
3106               PCRE2_SPTR p = start_subject + local_offsets[0];
3107               PCRE2_SPTR pp = start_subject + local_offsets[1];
3108               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3109               }
3110 #endif
3111             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3112             if (repeat_state_offset >= 0)
3113               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3114             }
3115           }
3116         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3117         }
3118       break;
3119
3120
3121 /* ========================================================================== */
3122       /* Handle callouts */
3123
3124       case OP_CALLOUT:
3125       case OP_CALLOUT_STR:
3126         {
3127         PCRE2_SIZE callout_length;
3128         rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3129           &callout_length);
3130         if (rrc < 0) return rrc;   /* Abandon */
3131         if (rrc == 0)
3132           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3133         }
3134       break;
3135
3136
3137 /* ========================================================================== */
3138       default:        /* Unsupported opcode */
3139       return PCRE2_ERROR_DFA_UITEM;
3140       }
3141
3142     NEXT_ACTIVE_STATE: continue;
3143
3144     }      /* End of loop scanning active states */
3145
3146   /* We have finished the processing at the current subject character. If no
3147   new states have been set for the next character, we have found all the
3148   matches that we are going to find. If we are at the top level and partial
3149   matching has been requested, check for appropriate conditions.
3150
3151   The "forced_fail" variable counts the number of (*F) encountered for the
3152   character. If it is equal to the original active_count (saved in
3153   workspace[1]) it means that (*F) was found on every active state. In this
3154   case we don't want to give a partial match.
3155
3156   The "could_continue" variable is true if a state could have continued but
3157   for the fact that the end of the subject was reached. */
3158
3159   if (new_count <= 0)
3160     {
3161     if (rlevel == 1 &&                               /* Top level, and */
3162         could_continue &&                            /* Some could go on, and */
3163         forced_fail != workspace[1] &&               /* Not all forced fail & */
3164         (                                            /* either... */
3165         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3166         ||                                           /* or... */
3167         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3168          match_count < 0)                            /* no matches */
3169         ) &&                                         /* And... */
3170         (
3171         partial_newline ||                           /* Either partial NL */
3172           (                                          /* or ... */
3173           ptr >= end_subject &&                /* End of subject and */
3174           ptr > mb->start_used_ptr)            /* Inspected non-empty string */
3175           )
3176         )
3177       match_count = PCRE2_ERROR_PARTIAL;
3178     break;  /* Exit from loop along the subject string */
3179     }
3180
3181   /* One or more states are active for the next character. */
3182
3183   ptr += clen;    /* Advance to next subject character */
3184   }               /* Loop to move along the subject string */
3185
3186 /* Control gets here from "break" a few lines above. If we have a match,
3187 PCRE2_ENDANCHORED is set, and ptr is short of end_subject, the match fails. */
3188
3189 if (match_count >= 0 &&
3190     ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3191     ptr < end_subject)
3192   match_count = PCRE2_ERROR_NOMATCH;
3193
3194 return match_count;
3195 }
3196
3197
3198
3199 /*************************************************
3200 *     Match a pattern using the DFA algorithm    *
3201 *************************************************/
3202
3203 /* This function matches a compiled pattern to a subject string, using the
3204 alternate matching algorithm that finds all matches at once.
3205
3206 Arguments:
3207   code          points to the compiled pattern
3208   subject       subject string
3209   length        length of subject string
3210   start_offset  where to start matching in the subject
3211   options       option bits
3212   match_data    points to a match data structure
3213   mcontext      points to a match context
3214   workspace     pointer to workspace
3215   wscount       size of workspace
3216
3217 Returns:        > 0 => number of match offset pairs placed in offsets
3218                 = 0 => offsets overflowed; longest matches are present
3219                  -1 => failed to match
3220                < -1 => some kind of unexpected problem
3221 */
3222
3223 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3224 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3225   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3226   pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3227 {
3228 int rc;
3229 const pcre2_real_code *re = (const pcre2_real_code *)code;
3230
3231 PCRE2_SPTR start_match;
3232 PCRE2_SPTR end_subject;
3233 PCRE2_SPTR bumpalong_limit;
3234 PCRE2_SPTR req_cu_ptr;
3235
3236 BOOL utf, anchored, startline, firstline;
3237 BOOL has_first_cu = FALSE;
3238 BOOL has_req_cu = FALSE;
3239
3240 PCRE2_UCHAR first_cu = 0;
3241 PCRE2_UCHAR first_cu2 = 0;
3242 PCRE2_UCHAR req_cu = 0;
3243 PCRE2_UCHAR req_cu2 = 0;
3244
3245 const uint8_t *start_bits = NULL;
3246
3247 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3248 is used below, and it expects NLBLOCK to be defined as a pointer. */
3249
3250 pcre2_callout_block cb;
3251 dfa_match_block actual_match_block;
3252 dfa_match_block *mb = &actual_match_block;
3253
3254 /* Set up a starting block of memory for use during recursive calls to
3255 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3256 in the case when it is not needed. If this is too small, more memory is
3257 obtained from the heap. At the start of each block is an anchor structure.*/
3258
3259 int base_recursion_workspace[RWS_BASE_SIZE];
3260 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3261 rws->next = NULL;
3262 rws->size = RWS_BASE_SIZE;
3263 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3264
3265 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3266 subject string. */
3267
3268 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3269
3270 /* Plausibility checks */
3271
3272 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3273 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3274   return PCRE2_ERROR_NULL;
3275 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3276 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3277
3278 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3279 time. */
3280
3281 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3282    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3283   return PCRE2_ERROR_BADOPTION;
3284
3285 /* Check that the first field in the block is the magic number. If it is not,
3286 return with PCRE2_ERROR_BADMAGIC. */
3287
3288 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3289
3290 /* Check the code unit width. */
3291
3292 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3293   return PCRE2_ERROR_BADMODE;
3294
3295 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3296 options variable for this function. Users of PCRE2 who are not calling the
3297 function directly would like to have a way of setting these flags, in the same
3298 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3299 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3300 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3301 transferred to the options for this function. The bits are guaranteed to be
3302 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3303 that the match-time bits are not more significant than the flag bits. If by
3304 accident this is not the case, a compile-time division by zero error will
3305 occur. */
3306
3307 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3308 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3309 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3310 #undef FF
3311 #undef OO
3312
3313 /* If restarting after a partial match, do some sanity checks on the contents
3314 of the workspace. */
3315
3316 if ((options & PCRE2_DFA_RESTART) != 0)
3317   {
3318   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3319     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3320       return PCRE2_ERROR_DFA_BADRESTART;
3321   }
3322
3323 /* Set some local values */
3324
3325 utf = (re->overall_options & PCRE2_UTF) != 0;
3326 start_match = subject + start_offset;
3327 end_subject = subject + length;
3328 req_cu_ptr = start_match - 1;
3329 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3330   (re->overall_options & PCRE2_ANCHORED) != 0;
3331
3332 /* The "must be at the start of a line" flags are used in a loop when finding
3333 where to start. */
3334
3335 startline = (re->flags & PCRE2_STARTLINE) != 0;
3336 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3337 bumpalong_limit = end_subject;
3338
3339 /* Initialize and set up the fixed fields in the callout block, with a pointer
3340 in the match block. */
3341
3342 mb->cb = &cb;
3343 cb.version = 2;
3344 cb.subject = subject;
3345 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3346 cb.callout_flags = 0;
3347 cb.capture_top      = 1;      /* No capture support */
3348 cb.capture_last     = 0;
3349 cb.mark             = NULL;   /* No (*MARK) support */
3350
3351 /* Get data from the match context, if present, and fill in the remaining
3352 fields in the match block. It is an error to set an offset limit without
3353 setting the flag at compile time. */
3354
3355 if (mcontext == NULL)
3356   {
3357   mb->callout = NULL;
3358   mb->memctl = re->memctl;
3359   mb->match_limit = PRIV(default_match_context).match_limit;
3360   mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3361   mb->heap_limit = PRIV(default_match_context).heap_limit;
3362   }
3363 else
3364   {
3365   if (mcontext->offset_limit != PCRE2_UNSET)
3366     {
3367     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3368       return PCRE2_ERROR_BADOFFSETLIMIT;
3369     bumpalong_limit = subject + mcontext->offset_limit;
3370     }
3371   mb->callout = mcontext->callout;
3372   mb->callout_data = mcontext->callout_data;
3373   mb->memctl = mcontext->memctl;
3374   mb->match_limit = mcontext->match_limit;
3375   mb->match_limit_depth = mcontext->depth_limit;
3376   mb->heap_limit = mcontext->heap_limit;
3377   }
3378
3379 if (mb->match_limit > re->limit_match)
3380   mb->match_limit = re->limit_match;
3381
3382 if (mb->match_limit_depth > re->limit_depth)
3383   mb->match_limit_depth = re->limit_depth;
3384
3385 if (mb->heap_limit > re->limit_heap)
3386   mb->heap_limit = re->limit_heap;
3387
3388 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3389   re->name_count * re->name_entry_size;
3390 mb->tables = re->tables;
3391 mb->start_subject = subject;
3392 mb->end_subject = end_subject;
3393 mb->start_offset = start_offset;
3394 mb->moptions = options;
3395 mb->poptions = re->overall_options;
3396 mb->match_call_count = 0;
3397 mb->heap_used = 0;
3398
3399 /* Process the \R and newline settings. */
3400
3401 mb->bsr_convention = re->bsr_convention;
3402 mb->nltype = NLTYPE_FIXED;
3403 switch(re->newline_convention)
3404   {
3405   case PCRE2_NEWLINE_CR:
3406   mb->nllen = 1;
3407   mb->nl[0] = CHAR_CR;
3408   break;
3409
3410   case PCRE2_NEWLINE_LF:
3411   mb->nllen = 1;
3412   mb->nl[0] = CHAR_NL;
3413   break;
3414
3415   case PCRE2_NEWLINE_NUL:
3416   mb->nllen = 1;
3417   mb->nl[0] = CHAR_NUL;
3418   break;
3419
3420   case PCRE2_NEWLINE_CRLF:
3421   mb->nllen = 2;
3422   mb->nl[0] = CHAR_CR;
3423   mb->nl[1] = CHAR_NL;
3424   break;
3425
3426   case PCRE2_NEWLINE_ANY:
3427   mb->nltype = NLTYPE_ANY;
3428   break;
3429
3430   case PCRE2_NEWLINE_ANYCRLF:
3431   mb->nltype = NLTYPE_ANYCRLF;
3432   break;
3433
3434   default: return PCRE2_ERROR_INTERNAL;
3435   }
3436
3437 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3438 we must also check that a starting offset does not point into the middle of a
3439 multiunit character. We check only the portion of the subject that is going to
3440 be inspected during matching - from the offset minus the maximum back reference
3441 to the given length. This saves time when a small part of a large subject is
3442 being matched by the use of a starting offset. Note that the maximum lookbehind
3443 is a number of characters, not code units. */
3444
3445 #ifdef SUPPORT_UNICODE
3446 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3447   {
3448   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3449
3450   if (start_offset > 0)
3451     {
3452 #if PCRE2_CODE_UNIT_WIDTH != 32
3453     unsigned int i;
3454     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3455       return PCRE2_ERROR_BADUTFOFFSET;
3456     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3457       {
3458       check_subject--;
3459       while (check_subject > subject &&
3460 #if PCRE2_CODE_UNIT_WIDTH == 8
3461       (*check_subject & 0xc0) == 0x80)
3462 #else  /* 16-bit */
3463       (*check_subject & 0xfc00) == 0xdc00)
3464 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3465         check_subject--;
3466       }
3467 #else   /* In the 32-bit library, one code unit equals one character. */
3468     check_subject -= re->max_lookbehind;
3469     if (check_subject < subject) check_subject = subject;
3470 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3471     }
3472
3473   /* Validate the relevant portion of the subject. After an error, adjust the
3474   offset to be an absolute offset in the whole string. */
3475
3476   match_data->rc = PRIV(valid_utf)(check_subject,
3477     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3478   if (match_data->rc != 0)
3479     {
3480     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3481     return match_data->rc;
3482     }
3483   }
3484 #endif  /* SUPPORT_UNICODE */
3485
3486 /* Set up the first code unit to match, if available. If there's no first code
3487 unit there may be a bitmap of possible first characters. */
3488
3489 if ((re->flags & PCRE2_FIRSTSET) != 0)
3490   {
3491   has_first_cu = TRUE;
3492   first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3493   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3494     {
3495     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3496 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3497     if (utf && first_cu > 127)
3498       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3499 #endif
3500     }
3501   }
3502 else
3503   if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3504     start_bits = re->start_bitmap;
3505
3506 /* There may be a "last known required code unit" set. */
3507
3508 if ((re->flags & PCRE2_LASTSET) != 0)
3509   {
3510   has_req_cu = TRUE;
3511   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3512   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3513     {
3514     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3515 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3516     if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3517 #endif
3518     }
3519   }
3520
3521 /* Fill in fields that are always returned in the match data. */
3522
3523 match_data->code = re;
3524 match_data->subject = subject;
3525 match_data->mark = NULL;
3526 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3527
3528 /* Call the main matching function, looping for a non-anchored regex after a
3529 failed match. If not restarting, perform certain optimizations at the start of
3530 a match. */
3531
3532 for (;;)
3533   {
3534   /* ----------------- Start of match optimizations ---------------- */
3535
3536   /* There are some optimizations that avoid running the match if a known
3537   starting point is not found, or if a known later code unit is not present.
3538   However, there is an option (settable at compile time) that disables
3539   these, for testing and for ensuring that all callouts do actually occur.
3540   The optimizations must also be avoided when restarting a DFA match. */
3541
3542   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3543       (options & PCRE2_DFA_RESTART) == 0)
3544     {
3545     /* If firstline is TRUE, the start of the match is constrained to the first
3546     line of a multiline string. That is, the match must be before or at the
3547     first newline following the start of matching. Temporarily adjust
3548     end_subject so that we stop the optimization scans for a first code unit
3549     immediately after the first character of a newline (the first code unit can
3550     legitimately be a newline). If the match fails at the newline, later code
3551     breaks this loop. */
3552
3553     if (firstline)
3554       {
3555       PCRE2_SPTR t = start_match;
3556 #ifdef SUPPORT_UNICODE
3557       if (utf)
3558         {
3559         while (t < end_subject && !IS_NEWLINE(t))
3560           {
3561           t++;
3562           ACROSSCHAR(t < end_subject, t, t++);
3563           }
3564         }
3565       else
3566 #endif
3567       while (t < end_subject && !IS_NEWLINE(t)) t++;
3568       end_subject = t;
3569       }
3570
3571     /* Anchored: check the first code unit if one is recorded. This may seem
3572     pointless but it can help in detecting a no match case without scanning for
3573     the required code unit. */
3574
3575     if (anchored)
3576       {
3577       if (has_first_cu || start_bits != NULL)
3578         {
3579         BOOL ok = start_match < end_subject;
3580         if (ok)
3581           {
3582           PCRE2_UCHAR c = UCHAR21TEST(start_match);
3583           ok = has_first_cu && (c == first_cu || c == first_cu2);
3584           if (!ok && start_bits != NULL)
3585             {
3586 #if PCRE2_CODE_UNIT_WIDTH != 8
3587             if (c > 255) c = 255;
3588 #endif
3589             ok = (start_bits[c/8] & (1 << (c&7))) != 0;
3590             }
3591           }
3592         if (!ok) break;
3593         }
3594       }
3595
3596     /* Not anchored. Advance to a unique first code unit if there is one. In
3597     8-bit mode, the use of memchr() gives a big speed up, even though we have
3598     to call it twice in caseless mode, in order to find the earliest occurrence
3599     of the character in either of its cases. */
3600
3601     else
3602       {
3603       if (has_first_cu)
3604         {
3605         if (first_cu != first_cu2)  /* Caseless */
3606           {
3607 #if PCRE2_CODE_UNIT_WIDTH != 8
3608           PCRE2_UCHAR smc;
3609           while (start_match < end_subject &&
3610                 (smc = UCHAR21TEST(start_match)) != first_cu &&
3611                   smc != first_cu2)
3612             start_match++;
3613 #else  /* 8-bit code units */
3614           PCRE2_SPTR pp1 =
3615             memchr(start_match, first_cu, end_subject-start_match);
3616           PCRE2_SPTR pp2 =
3617             memchr(start_match, first_cu2, end_subject-start_match);
3618           if (pp1 == NULL)
3619             start_match = (pp2 == NULL)? end_subject : pp2;
3620           else
3621             start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3622 #endif
3623           }
3624
3625         /* The caseful case */
3626
3627         else
3628           {
3629 #if PCRE2_CODE_UNIT_WIDTH != 8
3630           while (start_match < end_subject && UCHAR21TEST(start_match) !=
3631                  first_cu)
3632             start_match++;
3633 #else
3634           start_match = memchr(start_match, first_cu, end_subject - start_match);
3635           if (start_match == NULL) start_match = end_subject;
3636 #endif
3637           }
3638
3639         /* If we can't find the required code unit, having reached the true end
3640         of the subject, break the bumpalong loop, to force a match failure,
3641         except when doing partial matching, when we let the next cycle run at
3642         the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3643         which partially matches "abc", even though the string does not contain
3644         the starting character "d". If we have not reached the true end of the
3645         subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3646         we also let the cycle run, because the matching string is legitimately
3647         allowed to start with the first code unit of a newline. */
3648
3649         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3650             start_match >= mb->end_subject)
3651           break;
3652         }
3653
3654       /* If there's no first code unit, advance to just after a linebreak for a
3655       multiline match if required. */
3656
3657       else if (startline)
3658         {
3659         if (start_match > mb->start_subject + start_offset)
3660           {
3661 #ifdef SUPPORT_UNICODE
3662           if (utf)
3663             {
3664             while (start_match < end_subject && !WAS_NEWLINE(start_match))
3665               {
3666               start_match++;
3667               ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3668               }
3669             }
3670           else
3671 #endif
3672           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3673             start_match++;
3674
3675           /* If we have just passed a CR and the newline option is ANY or
3676           ANYCRLF, and we are now at a LF, advance the match position by one
3677           more code unit. */
3678
3679           if (start_match[-1] == CHAR_CR &&
3680                (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3681                start_match < end_subject &&
3682                UCHAR21TEST(start_match) == CHAR_NL)
3683             start_match++;
3684           }
3685         }
3686
3687       /* If there's no first code unit or a requirement for a multiline line
3688       start, advance to a non-unique first code unit if any have been
3689       identified. The bitmap contains only 256 bits. When code units are 16 or
3690       32 bits wide, all code units greater than 254 set the 255 bit. */
3691
3692       else if (start_bits != NULL)
3693         {
3694         while (start_match < end_subject)
3695           {
3696           uint32_t c = UCHAR21TEST(start_match);
3697 #if PCRE2_CODE_UNIT_WIDTH != 8
3698           if (c > 255) c = 255;
3699 #endif
3700           if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3701           start_match++;
3702           }
3703
3704         /* See comment above in first_cu checking about the next line. */
3705
3706         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3707             start_match >= mb->end_subject)
3708           break;
3709         }
3710       }  /* End of first code unit handling */
3711
3712     /* Restore fudged end_subject */
3713
3714     end_subject = mb->end_subject;
3715
3716     /* The following two optimizations are disabled for partial matching. */
3717
3718     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3719       {
3720       /* The minimum matching length is a lower bound; no actual string of that
3721       length may actually match the pattern. Although the value is, strictly,
3722       in characters, we treat it as code units to avoid spending too much time
3723       in this optimization. */
3724
3725       if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3726
3727       /* If req_cu is set, we know that that code unit must appear in the
3728       subject for the match to succeed. If the first code unit is set, req_cu
3729       must be later in the subject; otherwise the test starts at the match
3730       point. This optimization can save a huge amount of backtracking in
3731       patterns with nested unlimited repeats that aren't going to match.
3732       Writing separate code for cased/caseless versions makes it go faster, as
3733       does using an autoincrement and backing off on a match.
3734
3735       HOWEVER: when the subject string is very, very long, searching to its end
3736       can take a long time, and give bad performance on quite ordinary
3737       patterns. This showed up when somebody was matching something like
3738       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3739       sufficiently long. */
3740
3741       if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3742         {
3743         PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3744
3745         /* We don't need to repeat the search if we haven't yet reached the
3746         place we found it at last time. */
3747
3748         if (p > req_cu_ptr)
3749           {
3750           if (req_cu != req_cu2)
3751             {
3752             while (p < end_subject)
3753               {
3754               uint32_t pp = UCHAR21INCTEST(p);
3755               if (pp == req_cu || pp == req_cu2) { p--; break; }
3756               }
3757             }
3758           else
3759             {
3760             while (p < end_subject)
3761               {
3762               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3763               }
3764             }
3765
3766           /* If we can't find the required code unit, break the matching loop,
3767           forcing a match failure. */
3768
3769           if (p >= end_subject) break;
3770
3771           /* If we have found the required code unit, save the point where we
3772           found it, so that we don't search again next time round the loop if
3773           the start hasn't passed this code unit yet. */
3774
3775           req_cu_ptr = p;
3776           }
3777         }
3778       }
3779     }
3780
3781   /* ------------ End of start of match optimizations ------------ */
3782
3783   /* Give no match if we have passed the bumpalong limit. */
3784
3785   if (start_match > bumpalong_limit) break;
3786
3787   /* OK, now we can do the business */
3788
3789   mb->start_used_ptr = start_match;
3790   mb->last_used_ptr = start_match;
3791   mb->recursive = NULL;
3792
3793   rc = internal_dfa_match(
3794     mb,                           /* fixed match data */
3795     mb->start_code,               /* this subexpression's code */
3796     start_match,                  /* where we currently are */
3797     start_offset,                 /* start offset in subject */
3798     match_data->ovector,          /* offset vector */
3799     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3800     workspace,                    /* workspace vector */
3801     (int)wscount,                 /* size of same */
3802     0,                            /* function recurse level */
3803     base_recursion_workspace);    /* initial workspace for recursion */
3804
3805   /* Anything other than "no match" means we are done, always; otherwise, carry
3806   on only if not anchored. */
3807
3808   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3809     {
3810     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3811       {
3812       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3813       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3814       }
3815     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3816     match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3817     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3818     match_data->rc = rc;
3819     goto EXIT;
3820     }
3821
3822   /* Advance to the next subject character unless we are at the end of a line
3823   and firstline is set. */
3824
3825   if (firstline && IS_NEWLINE(start_match)) break;
3826   start_match++;
3827 #ifdef SUPPORT_UNICODE
3828   if (utf)
3829     {
3830     ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3831     }
3832 #endif
3833   if (start_match > end_subject) break;
3834
3835   /* If we have just passed a CR and we are now at a LF, and the pattern does
3836   not contain any explicit matches for \r or \n, and the newline option is CRLF
3837   or ANY or ANYCRLF, advance the match position by one more character. */
3838
3839   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3840       start_match < end_subject &&
3841       UCHAR21TEST(start_match) == CHAR_NL &&
3842       (re->flags & PCRE2_HASCRORLF) == 0 &&
3843         (mb->nltype == NLTYPE_ANY ||
3844          mb->nltype == NLTYPE_ANYCRLF ||
3845          mb->nllen == 2))
3846     start_match++;
3847
3848   }   /* "Bumpalong" loop */
3849
3850 NOMATCH_EXIT:
3851 rc = PCRE2_ERROR_NOMATCH;
3852
3853 EXIT:
3854 while (rws->next != NULL)
3855   {
3856   RWS_anchor *next = rws->next;
3857   rws->next = next->next;
3858   mb->memctl.free(next, mb->memctl.memory_data);
3859   }
3860
3861 return rc;
3862 }
3863
3864 /* End of pcre2_dfa_match.c */