1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
42 /* This module contains a PCRE private debugging function for printing out the
43 internal form of a compiled regular expression, along with some supporting
44 local functions. This source file is #included in pcre2test.c at each supported
45 code unit width, with PCRE2_SUFFIX set appropriately, just like the functions
46 that comprise the library. It can also optionally be included in
47 pcre2_compile.c for detailed debugging in error situations. */
50 /* Tables of operator names. The same 8-bit table is used for all code unit
51 widths, so it must be defined only once. The list itself is defined in
52 pcre2_internal.h, which is #included by pcre2test before this file. */
54 #ifndef OP_LISTS_DEFINED
55 static const char *OP_names[] = { OP_NAME_LIST };
56 #define OP_LISTS_DEFINED
59 /* The functions and tables herein must all have mode-dependent names. */
/* This file is #included once for each supported code unit width, so each
file-local identifier below is passed through PCRE2_SUFFIX to give it a
mode-dependent name. The rest of the file refers only to the short names on
the left. */
61 #define OP_lengths PCRE2_SUFFIX(OP_lengths_)
62 #define get_ucpname PCRE2_SUFFIX(get_ucpname_)
63 #define pcre2_printint PCRE2_SUFFIX(pcre2_printint_)
64 #define print_char PCRE2_SUFFIX(print_char_)
65 #define print_custring PCRE2_SUFFIX(print_custring_)
66 #define print_custring_bylen PCRE2_SUFFIX(print_custring_bylen_)
67 #define print_prop PCRE2_SUFFIX(print_prop_)
69 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
70 the definition is next to the definition of the opcodes in pcre2_internal.h.
71 The contents of the table are, however, mode-dependent. */
/* The table is indexed by opcode. pcre2_printint() adds OP_lengths[*code]
(plus any extra data it has consumed) to advance past each item it prints, and
checks at compile time that the table has OP_TABLE_LENGTH entries. */
73 static const uint8_t OP_lengths[] = { OP_LENGTHS };
77 /*************************************************
78 * Print one character from a string *
79 *************************************************/
81 /* In UTF mode the character may occupy more than one code unit.
85 ptr pointer to first code unit of the character
86 utf TRUE if string is UTF (will be FALSE if UTF is not supported)
88 Returns: number of additional code units used
92 print_char(FILE *f, PCRE2_SPTR ptr, BOOL utf)
95 BOOL one_code_unit = !utf;
97 /* If UTF is supported and requested, check for a valid single code unit. */
99 #ifdef SUPPORT_UNICODE
102 #if PCRE2_CODE_UNIT_WIDTH == 8
103 one_code_unit = c < 0x80;
104 #elif PCRE2_CODE_UNIT_WIDTH == 16
105 one_code_unit = (c & 0xfc00) != 0xd800;
107 one_code_unit = (c & 0xfffff800u) != 0xd800u;
108 #endif /* CODE_UNIT_WIDTH */
110 #endif /* SUPPORT_UNICODE */
112 /* Handle a valid one-code-unit character at any width. */
116 if (PRINTABLE(c)) fprintf(f, "%c", (char)c);
117 else if (c < 0x80) fprintf(f, "\\x%02x", c);
118 else fprintf(f, "\\x{%02x}", c);
122 /* Code for invalid UTF code units and multi-unit UTF characters is different
123 for each width. If UTF is not supported, control should never get here, but we
124 need a return statement to keep the compiler happy. */
126 #ifndef SUPPORT_UNICODE
130 /* Malformed UTF-8 should occur only if the sanity check has been turned off.
131 Rather than swallow random bytes, just stop if we hit a bad one. Print it with
132 \X instead of \x as an indication. */
134 #if PCRE2_CODE_UNIT_WIDTH == 8
135 if ((c & 0xc0) != 0xc0)
137 fprintf(f, "\\X{%x}", c); /* Invalid starting byte */
143 int a = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */
145 c = (c & PRIV(utf8_table3)[a]) << s;
146 for (i = 1; i <= a; i++)
148 if ((ptr[i] & 0xc0) != 0x80)
150 fprintf(f, "\\X{%x}", c); /* Invalid secondary byte */
154 c |= (ptr[i] & 0x3f) << s;
156 fprintf(f, "\\x{%x}", c);
159 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
161 /* UTF-16: rather than swallow a low surrogate, just stop if we hit a bad one.
162 Print it with \X instead of \x as an indication. */
164 #if PCRE2_CODE_UNIT_WIDTH == 16
165 if ((ptr[1] & 0xfc00) != 0xdc00)
167 fprintf(f, "\\X{%x}", c);
170 c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
171 fprintf(f, "\\x{%x}", c);
173 #endif /* PCRE2_CODE_UNIT_WIDTH == 16 */
175 /* For UTF-32 we get here only for a malformed code unit, which should only
176 occur if the sanity check has been turned off. Print it with \X instead of \x
179 #if PCRE2_CODE_UNIT_WIDTH == 32
180 fprintf(f, "\\X{%x}", c);
182 #endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
183 #endif /* SUPPORT_UNICODE */
188 /*************************************************
189 * Print string as a list of code units *
190 *************************************************/
192 /* These take no account of UTF as they always print each individual code unit.
193 The string is zero-terminated for print_custring(); the length is given for
194 print_custring_bylen().
198 ptr pointer to the string
199 len length for print_custring_bylen()
205 print_custring(FILE *f, PCRE2_SPTR ptr)
210 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c);
215 print_custring_bylen(FILE *f, PCRE2_SPTR ptr, PCRE2_UCHAR len)
217 for (; len > 0; len--)
220 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c);
226 /*************************************************
227 * Find Unicode property name *
228 *************************************************/
230 /* When there is no UTF/UCP support, the table of names does not exist. This
231 function should not be called in such configurations, because a pattern that
232 tries to use Unicode properties won't compile. Rather than put lots of #ifdefs
233 into the main code, however, we just put one into this function. */
236 get_ucpname(unsigned int ptype, unsigned int pvalue)
238 #ifdef SUPPORT_UNICODE
240 for (i = PRIV(utt_size) - 1; i >= 0; i--)
242 if (ptype == PRIV(utt)[i].type && pvalue == PRIV(utt)[i].value) break;
244 return (i >= 0)? PRIV(utt_names) + PRIV(utt)[i].name_offset : "??";
245 #else /* No UTF support */
249 #endif /* SUPPORT_UNICODE */
254 /*************************************************
255 * Print Unicode property value *
256 *************************************************/
258 /* "Normal" properties can be printed from tables. The PT_CLIST property is a
259 pseudo-property that contains a pointer to a list of case-equivalent
264 code pointer in the compiled code
265 before text to print before
266 after text to print after
272 print_prop(FILE *f, PCRE2_SPTR code, const char *before, const char *after)
274 if (code[1] != PT_CLIST)
276 fprintf(f, "%s%s %s%s", before, OP_names[*code], get_ucpname(code[1],
281 const char *not = (*code == OP_PROP)? "" : "not ";
282 const uint32_t *p = PRIV(ucd_caseless_sets) + code[2];
283 fprintf (f, "%s%sclist", before, not);
284 while (*p < NOTACHAR) fprintf(f, " %04x", *p++);
285 fprintf(f, "%s", after);
291 /*************************************************
292 * Print compiled pattern *
293 *************************************************/
295 /* The print_lengths flag controls whether offsets and lengths of items are
296 printed. Lengths can be turned off from pcre2test so that automatic tests on
297 bytecode can be written that do not depend on the value of LINK_SIZE.
300 re a compiled pattern
301 f the file to write to
302 print_lengths show various lengths
308 pcre2_printint(pcre2_code *re, FILE *f, BOOL print_lengths)
310 PCRE2_SPTR codestart, nametable, code;
311 uint32_t nesize = re->name_entry_size;
312 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
314 nametable = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code));
315 code = codestart = nametable + re->name_count * re->name_entry_size;
322 const char *flag = " ";
323 unsigned int extra = 0;
326 fprintf(f, "%3d ", (int)(code - codestart));
332 /* ========================================================================== */
333 /* These cases are never obeyed. This is a fudge that causes a compile-
334 time error if the vectors OP_names or OP_lengths, which are indexed
335 by opcode, are not the correct length. It seems to be the only way to do
336 such a check at compile time, as the sizeof() operator does not work in
337 the C preprocessor. */
339 case OP_TABLE_LENGTH:
340 case OP_TABLE_LENGTH +
341 ((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
342 (sizeof(OP_lengths) == OP_TABLE_LENGTH)):
344 /* ========================================================================== */
347 fprintf(f, " %s\n", OP_names[*code]);
348 fprintf(f, "------------------------------------------------------------------\n");
356 code += 1 + print_char(f, code, utf);
358 while (*code == OP_CHAR);
367 code += 1 + print_char(f, code, utf);
369 while (*code == OP_CHARI);
377 if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
378 else fprintf(f, " ");
379 fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
394 case OP_ASSERTBACK_NOT:
399 if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
400 else fprintf(f, " ");
401 fprintf(f, "%s", OP_names[*code]);
405 fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
409 fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
414 PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
415 fprintf(f, " %s Cond ref <", flag);
416 print_custring(f, entry);
417 fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
424 fprintf(f, " Cond recurse any");
426 fprintf(f, " Cond recurse %d", c);
431 PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
432 fprintf(f, " %s Cond recurse <", flag);
433 print_custring(f, entry);
434 fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
439 fprintf(f, " Cond false");
443 fprintf(f, " Cond true");
473 case OP_TYPEMINQUERY:
474 case OP_TYPEPOSQUERY:
475 fprintf(f, " %s ", flag);
477 if (*code >= OP_TYPESTAR)
479 if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
481 print_prop(f, code + 1, "", " ");
484 else fprintf(f, "%s", OP_names[code[1]]);
486 else extra = print_char(f, code+1, utf);
487 fprintf(f, "%s", OP_names[*code]);
500 fprintf(f, " %s ", flag);
501 extra = print_char(f, code + 1 + IMM2_SIZE, utf);
503 if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
504 fprintf(f, "%d}", GET2(code,1));
505 if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
506 else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
513 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
515 print_prop(f, code + IMM2_SIZE + 1, " ", " ");
518 else fprintf(f, " %s", OP_names[code[1 + IMM2_SIZE]]);
520 if (*code != OP_TYPEEXACT) fprintf(f, "0,");
521 fprintf(f, "%d}", GET2(code,1));
522 if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
523 else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
530 fprintf(f, " %s [^", flag);
531 extra = print_char(f, code + 1, utf);
542 case OP_NOTMINQUERYI:
543 case OP_NOTPOSQUERYI:
556 fprintf(f, " %s [^", flag);
557 extra = print_char(f, code + 1, utf);
558 fprintf(f, "]%s", OP_names[*code]);
572 fprintf(f, " %s [^", flag);
573 extra = print_char(f, code + 1 + IMM2_SIZE, utf);
575 if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
576 fprintf(f, "%d}", GET2(code,1));
577 if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
579 if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
583 if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
584 else fprintf(f, " ");
585 fprintf(f, "%s", OP_names[*code]);
592 fprintf(f, " %s \\%d", flag, GET2(code,1));
593 ccode = code + OP_lengths[*code];
594 goto CLASS_REF_REPEAT;
601 PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
602 fprintf(f, " %s \\k<", flag);
603 print_custring(f, entry);
604 fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
606 ccode = code + OP_lengths[*code];
607 goto CLASS_REF_REPEAT;
610 fprintf(f, " %s %d %d %d", OP_names[*code], code[1 + 2*LINK_SIZE],
611 GET(code, 1), GET(code, 1 + LINK_SIZE));
615 c = code[1 + 4*LINK_SIZE];
616 fprintf(f, " %s %c", OP_names[*code], c);
617 extra = GET(code, 1 + 2*LINK_SIZE);
618 print_custring_bylen(f, code + 2 + 4*LINK_SIZE, extra - 3 - 4*LINK_SIZE);
619 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
620 if (c == PRIV(callout_start_delims)[i])
622 c = PRIV(callout_end_delims)[i];
625 fprintf(f, "%c %d %d %d", c, GET(code, 1 + 3*LINK_SIZE), GET(code, 1),
626 GET(code, 1 + LINK_SIZE));
631 print_prop(f, code, " ", "");
634 /* OP_XCLASS cannot occur in 8-bit, non-UTF mode. However, there's no harm
635 in having this code always here, and it makes it less messy without all
642 unsigned int min, max;
644 BOOL invertmap = FALSE;
646 uint8_t inverted_map[32];
650 if (*code == OP_XCLASS)
652 extra = GET(code, 1);
653 ccode = code + LINK_SIZE + 1;
654 printmap = (*ccode & XCL_MAP) != 0;
655 if ((*ccode & XCL_NOT) != 0)
657 invertmap = (*ccode & XCL_HASPROP) == 0;
668 /* Print a bit map */
672 map = (uint8_t *)ccode;
675 for (i = 0; i < 32; i++) inverted_map[i] = ~map[i];
679 for (i = 0; i < 256; i++)
681 if ((map[i/8] & (1 << (i&7))) != 0)
684 for (j = i+1; j < 256; j++)
685 if ((map[j/8] & (1 << (j&7))) == 0) break;
686 if (i == '-' || i == ']') fprintf(f, "\\");
687 if (PRINTABLE(i)) fprintf(f, "%c", i);
688 else fprintf(f, "\\x%02x", i);
691 if (j != i + 1) fprintf(f, "-");
692 if (j == '-' || j == ']') fprintf(f, "\\");
693 if (PRINTABLE(j)) fprintf(f, "%c", j);
694 else fprintf(f, "\\x%02x", j);
699 ccode += 32 / sizeof(PCRE2_UCHAR);
702 /* For an XCLASS there is always some additional data */
704 if (*code == OP_XCLASS)
707 while ((ch = *ccode++) != XCL_END)
710 const char *notch = "";
721 unsigned int ptype = *ccode++;
722 unsigned int pvalue = *ccode++;
727 fprintf(f, "[:%sgraph:]", notch);
731 fprintf(f, "[:%sprint:]", notch);
735 fprintf(f, "[:%spunct:]", notch);
739 fprintf(f, "\\%c{%s}", (not? 'P':'p'),
740 get_ucpname(ptype, pvalue));
747 ccode += 1 + print_char(f, ccode, utf);
751 ccode += 1 + print_char(f, ccode, utf);
758 /* Indicate a non-UTF class which was created by negation */
760 fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
762 /* Handle repeats after a class or a back reference */
776 fprintf(f, "%s", OP_names[*ccode]);
777 extra += OP_lengths[*ccode];
784 max = GET2(ccode,1 + IMM2_SIZE);
785 if (max == 0) fprintf(f, "{%u,}", min);
786 else fprintf(f, "{%u,%u}", min, max);
787 if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
788 else if (*ccode == OP_CRPOSRANGE) fprintf(f, "+");
789 extra += OP_lengths[*ccode];
792 /* Do nothing if it's not a repeat; this code stops picky compilers
793 warning about the lack of a default code path. */
806 fprintf(f, " %s ", OP_names[*code]);
807 print_custring_bylen(f, code + 2, code[1]);
812 fprintf(f, " %s", OP_names[*code]);
820 /* Anything else is just an item with no data, but possibly a flag. */
823 fprintf(f, " %s %s", flag, OP_names[*code]);
827 code += OP_lengths[*code] + extra;
832 /* End of pcre2_printint.c */