1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
46 #include "pcre2_internal.h"
48 #define PTR_STACK_SIZE 20
50 #define SUBSTITUTE_OPTIONS \
51 (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
52 PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_UNKNOWN_UNSET| \
53 PCRE2_SUBSTITUTE_UNSET_EMPTY)
57 /*************************************************
58 * Find end of substitute text *
59 *************************************************/
61 /* In extended mode, we recognize ${name:+set text:unset text} and similar
62 constructions. This requires the identification of unescaped : and }
63 characters. This function scans for such. It must deal with nested ${
64 constructions. The pointer to the text is updated, either to the required end
65 character, or to where an error was detected.
68 code points to the compiled expression (for options)
69 ptrptr points to the pointer to the start of the text (updated)
70 ptrend end of the whole string
71 last TRUE if the last expected string (only } recognized)
74 negative error code on failure
/* Scan forward from *ptrptr for the unescaped character that ends a
substitution text. A } always qualifies; a : qualifies only when "last" is
FALSE. Callers treat a zero return as success; failures are negative error
codes. */
78 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
82 uint32_t nestlevel = 0;
84 PCRE2_SPTR ptr = *ptrptr;
86 for (; ptr < ptrend; ptr++)
90 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
/* A } ends the text only when nestlevel is zero, that is, when it does not
belong to a nested construction. */
97 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99 if (nestlevel == 0) goto EXIT;
/* A : ends the text only at the outermost level, and never when the caller
has said that this is the last expected text. */
103 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
/* A $ is of interest here only when a { follows it, which is the start of a
nested construction. */
105 else if (*ptr == CHAR_DOLLAR_SIGN)
107 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
114 else if (*ptr == CHAR_BACKSLASH)
120 if (ptr < ptrend - 1) switch (ptr[1])
/* check_escape() is handed a pointer that is just past the backslash and
moves it over the escape. Stepping back one afterwards lets the ptr++ of the
enclosing loop move on to the next unprocessed code unit. */
130 ptr += 1; /* Must point after \ */
131 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
132 code->overall_options, FALSE, NULL);
133 ptr -= 1; /* Back to last code unit of escape */
142 case 0: /* Data character */
143 case ESC_E: /* Isolated \E is ignored */
151 rc = PCRE2_ERROR_BADREPESCAPE;
157 rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
166 /*************************************************
167 * Match and substitute *
168 *************************************************/
170 /* This function applies a compiled re to a subject string and creates a new
171 string with substitutions. The first 7 arguments are the same as for
172 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
175 code points to the compiled expression
176 subject points to the subject string
177 length length of subject string (may contain binary zeros)
178 start_offset where to start in the subject string
180 match_data points to a match_data block, or is NULL
181 context points to a PCRE2 context
182 replacement points to the replacement string
183 rlength length of replacement string
184 buffer where to put the substituted string
185 blength points to length of buffer; updated to length of string
187 Returns: >= 0 number of substitutions made
189 PCRE2_ERROR_BADREPLACEMENT means invalid use of $
192 /* This macro checks for space in the buffer before copying into it. On
193 overflow, either give an error immediately, or keep on, accumulating the
/* CHECKMEMCPY(from,length) works on local variables of the function that uses
it (buffer, buff_offset, lengthleft, overflowed, extra_needed, suboptions) and
may jump to the label NOROOM. The first time the data does not fit, it either
jumps to NOROOM or, when PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, records the
shortfall in extra_needed. Once overflowed is set, each later length is simply
added to extra_needed. The memcpy() and the two updates after it are
presumably the remaining case, where there is room.
NOTE(review): the arguments are not parenthesized and length is used several
times, so only simple expressions without side effects should be passed. */
196 #define CHECKMEMCPY(from,length) \
197 if (!overflowed && lengthleft < length) \
199 if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
201 extra_needed = length - lengthleft; \
203 else if (overflowed) \
205 extra_needed += length; \
209 memcpy(buffer + buff_offset, from, CU2BYTES(length)); \
210 buff_offset += length; \
211 lengthleft -= length; \
214 /* Here's the function */
/* Match the compiled pattern against the subject and build the substituted
string in buffer. In outline: the options are checked, a match data block is
obtained (one is created here when the caller passes NULL), the subject up to
start_offset is copied, and then each successful match is followed by an
expansion of the replacement string. All output goes through CHECKMEMCPY, so
running out of room either ends the call or, with
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, only accumulates the length required. */
216 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
217 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
218 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
219 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
220 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
225 int forcecasereset = 0;
226 uint32_t ovector_count;
227 uint32_t goptions = 0;
/* TRUE when the match data block was allocated here and so must be freed
before returning. */
229 BOOL match_data_created = FALSE;
230 BOOL literal = FALSE;
231 BOOL overflowed = FALSE;
232 #ifdef SUPPORT_UNICODE
233 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
238 PCRE2_SIZE extra_needed = 0;
239 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
241 PCRE2_SIZE ovecsave[3];
/* Keep the caller's buffer size, and mark *blength as unset until there is a
real value to report. */
244 lengthleft = buff_length = *blength;
245 *blength = PCRE2_UNSET;
/* No previous match yet. ovecsave[0] and ovecsave[1] hold the offsets of the
last match, and ovecsave[2] the start offset that was in force when they were
saved. */
246 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
248 /* Partial matching is not valid. */
250 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
251 return PCRE2_ERROR_BADOPTION;
253 /* If no match data block is provided, create one. */
255 if (match_data == NULL)
/* Either the match context or the compiled code is passed on as if it were a
general context. NOTE(review): this presumes that both structures start with
the same memory control fields as a general context; confirm against the
structure definitions. */
257 pcre2_general_context *gcontext = (mcontext == NULL)?
258 (pcre2_general_context *)code :
259 (pcre2_general_context *)mcontext;
260 match_data = pcre2_match_data_create_from_pattern(code, gcontext);
261 if (match_data == NULL) return PCRE2_ERROR_NOMEMORY;
262 match_data_created = TRUE;
264 ovector = pcre2_get_ovector_pointer(match_data);
265 ovector_count = pcre2_get_ovector_count(match_data);
267 /* Find lengths of zero-terminated strings and the end of the replacement. */
269 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
270 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
271 repend = replacement + rlength;
273 /* Check UTF replacement string if necessary. */
275 #ifdef SUPPORT_UNICODE
276 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
278 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar));
281 match_data->leftchar = 0;
285 #endif /* SUPPORT_UNICODE */
287 /* Save the substitute options and remove them from the match options. */
289 suboptions = options & SUBSTITUTE_OPTIONS;
290 options &= ~SUBSTITUTE_OPTIONS;
292 /* Copy up to the start offset */
294 if (start_offset > length)
296 match_data->leftchar = 0;
297 rc = PCRE2_ERROR_BADOFFSET;
300 CHECKMEMCPY(subject, start_offset);
302 /* Loop for global substituting. */
307 PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
308 uint32_t ptrstackptr = 0;
/* goptions is zero except when retrying after an empty match, when it adds
PCRE2_ANCHORED and PCRE2_NOTEMPTY_ATSTART to the match options. */
310 rc = pcre2_match(code, subject, length, start_offset, options|goptions,
311 match_data, mcontext);
313 #ifdef SUPPORT_UNICODE
314 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
317 /* Any error other than no match returns the error code. No match when not
318 doing the special after-empty-match global rematch, or when at the end of the
319 subject, breaks the global loop. Otherwise, advance the starting point by one
320 character, copying it to the output, and try again. */
324 PCRE2_SIZE save_start;
326 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
327 if (goptions == 0 || start_offset >= length) break;
329 /* Advance by one code point. Then, if CRLF is a valid newline sequence and
330 we have advanced into the middle of it, advance one more code point. In
331 other words, do not start in the middle of CRLF, even if CR and LF on their
332 own are valid newlines. */
334 save_start = start_offset++;
335 if (subject[start_offset-1] == CHAR_CR &&
336 code->newline_convention != PCRE2_NEWLINE_CR &&
337 code->newline_convention != PCRE2_NEWLINE_LF &&
338 start_offset < length &&
339 subject[start_offset] == CHAR_LF)
342 /* Otherwise, in UTF mode, advance past any secondary code points. */
344 else if ((code->overall_options & PCRE2_UTF) != 0)
346 #if PCRE2_CODE_UNIT_WIDTH == 8
347 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
349 #elif PCRE2_CODE_UNIT_WIDTH == 16
350 while (start_offset < length &&
351 (subject[start_offset] & 0xfc00) == 0xdc00)
356 /* Copy what we have advanced past, reset the special global options, and
357 continue to the next match. */
359 fraglength = start_offset - save_start;
360 CHECKMEMCPY(subject + save_start, fraglength);
365 /* Handle a successful match. Matches that use \K to end before they start
366 or start before the current point in the subject are not supported. */
368 if (ovector[1] < ovector[0] || ovector[0] < start_offset)
370 rc = PCRE2_ERROR_BADSUBSPATTERN;
374 /* Check for the same match as previous. This is legitimate after matching an
375 empty string that starts after the initial match offset. We have tried again
376 at the match point in case the pattern is one like /(?<=\G.)/ which can never
377 match at its starting point, so running the match achieves the bumpalong. If
378 we do get the same (null) match at the original match point, it isn't such a
379 pattern, so we now do the empty string magic. In all other cases, a repeat
380 match should never occur. */
382 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
384 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
386 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
387 ovecsave[2] = start_offset;
388 continue; /* Back to the top of the loop */
390 rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
394 /* Count substitutions with a paranoid check for integer overflow; surely no
395 real call to this function would ever hit this! */
399 rc = PCRE2_ERROR_TOOMANYREPLACE;
/* A zero return from pcre2_match() is replaced by ovector_count.
NOTE(review): pcre2_match() is documented as returning 0 when the ovector is
too small for all the captures; confirm that this is the case being handled. */
404 /* Copy the text leading up to the match. */
406 if (rc == 0) rc = ovector_count;
407 fraglength = ovector[0] - start_offset;
408 CHECKMEMCPY(subject + start_offset, fraglength);
410 /* Process the replacement string. Literal mode is set by \Q, but only in
411 extended mode when backslashes are being interpreted. In extended mode we
412 must handle nested substrings that are to be reprocessed. */
420 /* If at the end of a nested substring, pop the stack. */
424 if (ptrstackptr <= 0) break; /* End of replacement string */
425 repend = ptrstack[--ptrstackptr];
426 ptr = ptrstack[--ptrstackptr];
430 /* Handle the next character */
434 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
443 /* Not in literal mode. */
445 if (*ptr == CHAR_DOLLAR_SIGN)
448 uint32_t special = 0;
451 PCRE2_SIZE sublength;
452 PCRE2_SPTR text1_start = NULL;
453 PCRE2_SPTR text1_end = NULL;
454 PCRE2_SPTR text2_start = NULL;
455 PCRE2_SPTR text2_end = NULL;
/* Holds a group name; the scan below gives up with BAD once more than 32 code
units have been collected. */
457 PCRE2_UCHAR name[33];
/* A dollar at the very end of the replacement is an error. A doubled dollar
is not a substitution: it jumps to LOADLITERAL. NOTE(review): presumably that
copies a single literal $ - confirm at the label. */
459 if (++ptr >= repend) goto BAD;
460 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
467 if (next == CHAR_LEFT_CURLY_BRACKET)
469 if (++ptr >= repend) goto BAD;
474 if (next == CHAR_ASTERISK)
476 if (++ptr >= repend) goto BAD;
481 if (!star && next >= CHAR_0 && next <= CHAR_9)
483 group = next - CHAR_0;
484 while (++ptr < repend)
487 if (next < CHAR_0 || next > CHAR_9) break;
488 group = group * 10 + next - CHAR_0;
490 /* A check for a number greater than the highest captured group
491 is sufficient here; no need for a separate overflow check. If unknown
492 groups are to be treated as unset, just skip over any remaining
493 digits and carry on. */
495 if (group > code->top_bracket)
497 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
499 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
504 rc = PCRE2_ERROR_NOSUBSTRING;
512 const uint8_t *ctypes = code->tables + ctypes_offset;
513 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
516 if (n > 32) goto BAD;
517 if (++ptr >= repend) break;
520 if (n == 0) goto BAD;
524 /* In extended mode we recognize ${name:+set text:unset text} and
525 ${name:-default text}. */
529 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
530 !star && ptr < repend - 2 && next == CHAR_COLON)
/* After the colon only + or - is accepted. */
533 if (special != CHAR_PLUS && special != CHAR_MINUS)
535 rc = PCRE2_ERROR_BADSUBSTITUTION;
/* With -, there is only one text, so it is scanned as the last one and only
a } can end it. */
540 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
541 if (rc != 0) goto PTREXIT;
/* With +, a colon after the first text introduces the second text, which is
always the last. */
544 if (special == CHAR_PLUS && *ptr == CHAR_COLON)
547 rc = find_text_end(code, &ptr, repend, TRUE);
548 if (rc != 0) goto PTREXIT;
555 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
557 rc = PCRE2_ERROR_REPMISSINGBRACE;
565 /* Have found a syntactically correct group number or name, or *name.
566 Only *MARK is currently recognized. */
570 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
572 PCRE2_SPTR mark = pcre2_get_mark(match_data);
575 PCRE2_SPTR mark_start = mark;
/* The mark is treated as zero-terminated: scan to the terminator to find its
length in code units. */
576 while (*mark != 0) mark++;
577 fraglength = mark - mark_start;
578 CHECKMEMCPY(mark_start, fraglength);
584 /* Substitute the contents of a group. We don't use substring_copy
585 functions any more, in order to support case forcing. */
589 PCRE2_SPTR subptr, subptrend;
591 /* Find a number for a named group. In case there are duplicate names,
592 search for the first one that is set. If the name is not found when
593 PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a
594 non-existent group. */
598 PCRE2_SPTR first, last, entry;
599 rc = pcre2_substring_nametable_scan(code, name, &first, &last);
600 if (rc == PCRE2_ERROR_NOSUBSTRING &&
601 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
603 group = code->top_bracket + 1;
607 if (rc < 0) goto PTREXIT;
/* The non-negative value returned by pcre2_substring_nametable_scan() is used
as the stride between entries; the group number of each entry is read with
GET2 at offset 0. */
608 for (entry = first; entry <= last; entry += rc)
610 uint32_t ng = GET2(entry, 0);
611 if (ng < ovector_count)
613 if (group < 0) group = ng; /* First in ovector */
614 if (ovector[ng*2] != PCRE2_UNSET)
616 group = ng; /* First that is set */
622 /* If group is still negative, it means we did not find a group
623 that is in the ovector. Just set the first group. */
625 if (group < 0) group = GET2(first, 0);
629 /* We now have a group that is identified by number. Find the length of
630 the captured string. If a group in a non-special substitution is unset
631 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
633 rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
/* When unknown groups are to be treated as unset, convert the error so that
the handling of unset groups below applies. */
636 if (rc == PCRE2_ERROR_NOSUBSTRING &&
637 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
639 rc = PCRE2_ERROR_UNSET;
641 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
642 if (special == 0) /* Plain substitution */
644 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
645 goto PTREXIT; /* Else error */
649 /* If special is '+' we have a 'set' and possibly an 'unset' text,
650 both of which are reprocessed when used. If special is '-' we have a
651 default text for when the group is unset; it must be reprocessed. */
/* For -, a set group is substituted literally; otherwise the default text is
treated in the same way as the 'unset' text of a + substitution. */
655 if (special == CHAR_MINUS)
657 if (rc == 0) goto LITERAL_SUBSTITUTE;
658 text2_start = text1_start;
659 text2_end = text1_end;
/* Nested texts are pushed as a pair: the position to resume at, then the end
pointer. They are popped in the reverse order at the top of the replacement
loop. As pushes and pops always come in pairs and PTR_STACK_SIZE is even, one
bound check covers both stores. A full stack goes to BAD. */
662 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
663 ptrstack[ptrstackptr++] = ptr;
664 ptrstack[ptrstackptr++] = repend;
679 /* Otherwise we have a literal substitution of a group's contents. */
682 subptr = subject + ovector[group*2];
683 subptrend = subject + ovector[group*2 + 1];
685 /* Substitute a literal string, possibly forcing alphabetic case. */
687 while (subptr < subptrend)
689 GETCHARINCTEST(ch, subptr);
692 #ifdef SUPPORT_UNICODE
695 uint32_t type = UCD_CHARTYPE(ch);
696 if (PRIV(ucp_gentype)[type] == ucp_L &&
697 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
698 ch = UCD_OTHERCASE(ch);
703 if (((code->tables + cbits_offset +
704 ((forcecase > 0)? cbit_upper:cbit_lower)
705 )[ch/8] & (1 << (ch%8))) == 0)
706 ch = (code->tables + fcc_offset)[ch];
708 forcecase = forcecasereset;
711 #ifdef SUPPORT_UNICODE
712 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
718 CHECKMEMCPY(temp, chlen);
723 /* Handle an escape sequence in extended mode. We can use check_escape()
724 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
725 the case-forcing escapes are not supported in pcre2_compile() so must be
728 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
729 *ptr == CHAR_BACKSLASH)
733 if (ptr < repend - 1) switch (ptr[1])
736 forcecase = forcecasereset = -1;
747 forcecase = forcecasereset = 1;
761 ptr++; /* Point after \ */
762 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
763 code->overall_options, FALSE, NULL);
764 if (errorcode != 0) goto BADESCAPE;
769 forcecase = forcecasereset = 0;
776 case 0: /* Data character */
784 /* Handle a literal code unit */
789 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
794 #ifdef SUPPORT_UNICODE
797 uint32_t type = UCD_CHARTYPE(ch);
798 if (PRIV(ucp_gentype)[type] == ucp_L &&
799 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
800 ch = UCD_OTHERCASE(ch);
805 if (((code->tables + cbits_offset +
806 ((forcecase > 0)? cbit_upper:cbit_lower)
807 )[ch/8] & (1 << (ch%8))) == 0)
808 ch = (code->tables + fcc_offset)[ch];
810 forcecase = forcecasereset;
813 #ifdef SUPPORT_UNICODE
814 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
820 CHECKMEMCPY(temp, chlen);
821 } /* End handling a literal code unit */
822 } /* End of loop for scanning the replacement. */
824 /* The replacement has been copied to the output. Save the details of this
825 match. See above for how this data is used. If we matched an empty string, do
826 the magic for global matches. Finally, update the start offset to point to
827 the rest of the subject string. */
829 ovecsave[0] = ovector[0];
830 ovecsave[1] = ovector[1];
831 ovecsave[2] = start_offset;
/* The special retry options are wanted only after an empty match that began
exactly at start_offset; any other match clears them. */
833 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
834 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
835 start_offset = ovector[1];
836 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
838 /* Copy the rest of the subject. */
840 fraglength = length - start_offset;
841 CHECKMEMCPY(subject + start_offset, fraglength);
/* Add one more code unit, taken from temp. NOTE(review): presumably temp[0]
has just been set to zero so that this is the string terminator - confirm. */
843 CHECKMEMCPY(temp , 1);
845 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
846 and matching has carried on after a full buffer, in order to compute the length
847 needed. Otherwise, an overflow generates an immediate error return. */
/* Report the failure, but tell the caller how big the buffer would have to
be: its original size plus the accumulated shortfall. */
851 rc = PCRE2_ERROR_NOMEMORY;
852 *blength = buff_length + extra_needed;
855 /* After a successful execution, return the number of substitutions and set the
856 length of buffer used, excluding the trailing zero. */
861 *blength = buff_offset - 1;
/* A block created here is released; a block supplied by the caller has the
final return code stored in it. */
865 if (match_data_created) pcre2_match_data_free(match_data);
866 else match_data->rc = rc;
870 rc = PCRE2_ERROR_NOMEMORY;
874 rc = PCRE2_ERROR_BADREPLACEMENT;
878 rc = PCRE2_ERROR_BADREPESCAPE;
/* For errors in the replacement string, *blength reports the offset within
the replacement that ptr had reached. */
881 *blength = (PCRE2_SIZE)(ptr - replacement);
885 /* End of pcre2_substitute.c */