1 # This set of tests checks the API, internals, and non-Perl stuff for UTF
2 # support, including Unicode properties. However, tests that give different
3 # results in 8-bit, 16-bit, and 32-bit modes are excluded (see tests 10 and
6 #newline_default lf any anycrlf
8 # PCRE2 and Perl disagree about the characteristics of certain Unicode
9 # characters. For example, 061C was considered by Perl to be Arabic, though
10 # it was not listed as such in the Unicode Scripts.txt file for Unicode 8.
11 # However, it *is* in that file for Unicode 10, but when I came to re-check,
12 # Perl had changed in the meantime, with 5.026 not recognizing it as Arabic.
14 # 2066-2069 are graphic and printable according to Perl, though they are
15 # actually "isolate" control characters. That is why the following tests are
16 # here rather than in test 4.
21 /^[[:graph:]]+$/utf,ucp
29 /^[[:print:]]+$/utf,ucp
37 /^[[:^graph:]]+$/utf,ucp
38 \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}
39 \x{2028}\x{2029}\x{202f}\x{2065}\x{2066}\x{2067}\x{2068}\x{2069}
41 /^[[:^print:]]+$/utf,ucp
42 \x{09}\x{1D}\x{85}\x{61c}\x{2028}\x{2029}\x{2065}\x{2066}\x{2067}
45 # Perl does not consider U+180e to be a space character. It is true that it
46 # does not appear in the Unicode PropList.txt file as such, but in many other
47 # sources it is listed as a space, and has been treated as such in PCRE for
50 /^>[[:blank:]]*/utf,ucp
51 >\x{20}\x{a0}\x{1680}\x{180e}\x{2000}\x{202f}\x{9}\x{b}\x{2028}
54 A\x{85}\x{180e}\x{2005}Z
60 /^[[:graph:]]+$/utf,ucp
64 /^[[:print:]]+$/utf,ucp
67 /^[[:^graph:]]+$/utf,ucp
68 \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{61c}\x{1680}\x{180e}
70 /^[[:^print:]]+$/utf,ucp
74 # End of U+180E tests.
76 # ---------------------------------------------------------------------
106 /^\x{100}a\x{1234}/utf
109 /\x{0041}\x{2262}\x{0391}\x{002e}/IB,utf
110 \x{0041}\x{2262}\x{0391}\x{002e}
113 \x{212ab}\x{212ab}\x{212ab}\x{861}X
116 \x{212ab}\x{212ab}\x{212ab}\x{861}
125 /\x{100}*(\d+|"(?1)")/utf
141 /[\x{200}-\x{100}]/utf
156 Ö # Matches without Study
160 Ö <-- Same with Study
164 Ö # Matches without Study
168 Ö <-- Same with Study
171 /[^\x{100}]abc(xyz(?1))/IB,utf
173 /(\x{100}(b(?2)c))?/IB,utf
175 /(\x{100}(b(?2)c)){0,2}/IB,utf
177 /(\x{100}(b(?1)c))?/IB,utf
179 /(\x{100}(b(?1)c)){0,2}/IB,utf
188 # Use no_start_optimize because the first code unit is different in 8-bit from
191 /^\ሴ/IB,utf,no_start_optimize
193 /()()()()()()()()()()
200 /^[\x{100}\E-\Q\E\x{150}]/B,utf
202 /^[\QĀ\E-\QŐ\E]/B,utf
204 /^abc./gmx,newline=any,utf
205 abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
207 /abc.$/gmx,newline=any,utf
208 abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x{0085} abc7\x{2028} abc8\x{2029} abc9
210 /^a\Rb/bsr=unicode,utf
222 /^a\R*b/bsr=unicode,utf
228 a\x0c\x{2028}\x{2029}b
233 /^a\R+b/bsr=unicode,utf
238 a\x0c\x{2028}\x{2029}b
245 /^a\R{1,3}b/bsr=unicode,utf
263 /\H*\h+\V?\v{3,4}/utf
264 \x09\x20\x{a0}X\x0a\x0b\x0c\x0d\x0a
265 \x09\x20\x{a0}\x0a\x0b\x0c\x0d\x0a
266 \x09\x20\x{a0}\x0a\x0b\x0c
268 \x09\x20\x{a0}\x0a\x0b
271 \x{3001}\x{3000}\x{2030}\x{2028}
276 /\H*\h+\V?\v{3,4}/utf
277 \x{1680}\x{180e}\x{2007}X\x{2028}\x{2029}\x0c\x0d\x0a
278 \x09\x{205f}\x{a0}\x0a\x{2029}\x0c\x{2028}\x0a
279 \x09\x20\x{202f}\x0a\x0b\x0c
281 \x09\x{200a}\x{a0}\x{2028}\x0b
287 >\x{1680}\x{180e}\x{2000}\x{2003}\x{200a}\x{202f}\x{205f}\x{3000}<
298 /a\Rb/I,bsr=anycrlf,utf
306 /a\Rb/I,bsr=unicode,utf
313 /a\R?b/I,bsr=anycrlf,utf
321 /a\R?b/I,bsr=unicode,utf
328 /.*a.*=.b.*/utf,newline=any
329 QQQ\x{2029}ABCaXYZ=!bPQR
336 /a[^]b/utf,alt_bsux,allow_empty_class,match_unset_backref
342 /a[^]+b/utf,alt_bsux,allow_empty_class,match_unset_backref
351 /X/newline=any,utf,firstline
379 X\x{123}\x{123}\x{123}\=ps
380 X\x{123}\x{123}\x{123}\x{123}\=ps
386 X\x{123}\x{123}\x{123}\=ps
387 X\x{123}\x{123}\x{123}\x{123}\=ps
393 X\x{123}\x{123}\x{123}\=ps
394 X\x{123}\x{123}\x{123}\x{123}\=ps
401 X\x{123}\x{123}\x{123}x\=ps
402 X\x{123}\x{123}\x{123}\x{123}x\=ps
409 X\x{123}\x{123}\x{123}x\=ps
410 X\x{123}\x{123}\x{123}\x{123}x\=ps
417 X\x{123}\x{123}\x{123}x\=ps
418 X\x{123}\x{123}\x{123}\x{123}x\=ps
466 X\x{123}\x{123}\x{123}\=ps
467 X\x{123}\x{123}\x{123}\x{123}\=ps
473 X\x{123}\x{123}\x{123}\=ps
474 X\x{123}\x{123}\x{123}\x{123}\=ps
480 X\x{123}\x{123}\x{123}\=ps
481 X\x{123}\x{123}\x{123}\x{123}\=ps
504 /X[abc\x{123}]{2,4}b/utf
508 X\x{123}\x{123}\x{123}\=ps
509 X\x{123}\x{123}\x{123}\x{123}\=ps
511 /X[abc\x{123}]{2,4}?b/utf
515 X\x{123}\x{123}\x{123}\=ps
516 X\x{123}\x{123}\x{123}\x{123}\=ps
518 /X[abc\x{123}]{2,4}+b/utf
522 X\x{123}\x{123}\x{123}\=ps
523 X\x{123}\x{123}\x{123}\x{123}\=ps
550 X\x{123}\x{123}\x{123}\=ps
551 X\x{123}\x{123}\x{123}\x{123}\=ps
557 X\x{123}\x{123}\x{123}\=ps
558 X\x{123}\x{123}\x{123}\x{123}\=ps
564 X\x{123}\x{123}\x{123}\=ps
565 X\x{123}\x{123}\x{123}\x{123}\=ps
588 /(\x{123})X\1{2,4}b/utf
591 \x{123}X\x{123}\x{123}\=ps
592 \x{123}X\x{123}\x{123}\x{123}\=ps
593 \x{123}X\x{123}\x{123}\x{123}\x{123}\=ps
595 /(\x{123})X\1{2,4}?b/utf
598 \x{123}X\x{123}\x{123}\=ps
599 \x{123}X\x{123}\x{123}\x{123}\=ps
600 \x{123}X\x{123}\x{123}\x{123}\x{123}\=ps
602 /(\x{123})X\1{2,4}+b/utf
605 \x{123}X\x{123}\x{123}\=ps
606 \x{123}X\x{123}\x{123}\x{123}\=ps
607 \x{123}X\x{123}\x{123}\x{123}\x{123}\=ps
638 /\sxxx\s/utf,tables=2
645 'A#хц'Bx,newline=any,utf
648 PQ'Bx,newline=any,utf
651 z#XX?/Bx,newline=any,utf
654 z#х?/Bx,newline=any,utf
656 /\g{A}xxx#bXX(?'A'123)
\r(?'A'456)/Bx,newline=any,utf
658 /\g{A}xxx#bх(?'A'123)
\r(?'A'456)/Bx,newline=any,utf
674 /[^\x{1234}]+?/Ii,utf
676 /[^\x{1234}]++/Ii,utf
678 /[^\x{1234}]{2}/Ii,utf
692 /\x{d7ff}\x{e000}/utf
699 \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000}
700 \x{3001}\x{2fff}\x{200a}\x{a0}\x{2000}
703 \x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000}
704 \x{3001}\x{2fff}\x{200a}\x{a0}\x{2000}
707 \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f}
708 \x{2000}\x{200a}\x{1fff}\x{200b}
709 \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060}
710 \x{a0}\x{3000}\x{9f}\x{a1}\x{2fff}\x{3001}
713 \x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f}
714 \x{2000}\x{200a}\x{1fff}\x{200b}
715 \x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060}
716 \x{a0}\x{3000}\x{9f}\x{a1}\x{2fff}\x{3001}
719 \x{2027}\x{2030}\x{2028}\x{2029}
720 \x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
723 \x{2027}\x{2030}\x{2028}\x{2029}
724 \x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
727 \x{2028}\x{2029}\x{2027}\x{2030}
728 \x{85}\x0a\x0b\x0c\x0d\x09\x0e\x{84}\x{86}
731 \x{2028}\x{2029}\x{2027}\x{2030}
732 \x{85}\x0a\x0b\x0c\x0d\x09\x0e\x{84}\x{86}
735 \x{2027}\x{2030}\x{2028}\x{2029}
736 \x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
781 /.{2,3}/utf,newline=crlf
789 /.{2,3}?/utf,newline=crlf
797 /[^\x{100}][^\x{1234}][^\x{ffff}][^\x{10000}][^\x{10ffff}]/B,utf
799 /[^\x{100}][^\x{1234}][^\x{ffff}][^\x{10000}][^\x{10ffff}]/Bi,utf
801 /[^\x{100}]*[^\x{10000}]+[^\x{10ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{fffff}]{5,6}+/B,utf
803 /[^\x{100}]*[^\x{10000}]+[^\x{10ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{fffff}]{5,6}+/Bi,utf
805 /(?<=\x{1234}\x{1234})\bxy/I,utf
811 /\u0100/B,utf,alt_bsux,allow_empty_class,match_unset_backref
813 /[\u0100-\u0200]/B,utf,alt_bsux,allow_empty_class,match_unset_backref
815 /\ud800/utf,alt_bsux,allow_empty_class,match_unset_backref
820 /[b-d\x{200}-\x{250}]*[ae-h]?#[\x{200}-\x{250}]{0,8}[\x00-\xff]*#[\x{200}-\x{250}]+[a-z]/B,utf
830 /[abc\p{L}\x{0660}]/IB,utf
854 /\p{Yi}+(\P{Yi}+)(?1)/
860 /[\P{Yi}\P{Yi}\P{Yi}A]/
864 /[^\P{Yi}\P{Yi}\P{Yi}A]/
880 /(\P{Yi}{0,3}?\277)*/
882 /(\p{Yi}{0,3}+\277)*/
886 \x{2028}\x{2028}\x{2028}
905 \x{dfff}\=no_utf_check
921 $\x{a2}\x{a3}\x{a4}\x{a5}\x{a6}
937 # These are here because Perl has problems with the negative versions of the
938 # properties and has changed how it behaves for caseless matching.
969 /[\x{c0}\x{391}]/i,utf
973 # The next two are special cases where the lengths of the different cases of
974 # the same character differ. The first went wrong with heap frame storage; the
975 # second was broken in all cases.
977 /^\x{023a}+?(\x{0130}+)/i,utf
978 \x{023a}\x{2c65}\x{0130}
980 /^\x{023a}+([^X])/i,utf
983 /\x{c0}+\x{116}+/i,utf
984 \x{c0}\x{e0}\x{116}\x{117}
986 /[\x{c0}\x{116}]+/i,utf
987 \x{c0}\x{e0}\x{116}\x{117}
1003 # The next two should be Perl-compatible, but it fails to match \x{e0}. PCRE
1004 # will match it only with UCP support, because without that it has no notion
1005 # of case for anything other than the ASCII letters.
1015 # These are PCRE's extra properties to help with Unicodizing \d etc.
1027 ABCD1234\x{6ca}\x{a6c}\x{10a7}_
1032 \x{6ca}\x{a6c}\x{10a7}_
1035 ABCD1234\x{6ca}\x{a6c}\x{10a7}_
1038 ABCD1234\x{6ca}\x{a6c}\x{10a7}_
1041 \x{6ca}\x{a6c}\x{10a7}_
1053 ABCD1234\x{6ca}\x{a6c}\x{10a7}_
1058 >\x{1680}\x{2028}\x{0b}
1064 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1067 >\x{1680}\x{2028}\x{0b}
1070 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1073 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1075 /^>\p{Xsp}{2,9}?/utf
1076 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1082 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1085 >\x{1680}\x{2028}\x{0b}
1091 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1094 >\x{1680}\x{2028}\x{0b}
1097 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1100 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1102 /^>\p{Xps}{2,9}?/utf
1103 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1109 > \x{09}\x{0a}\x{0c}\x{0d}\x{a0}\x{1680}\x{2028}\x{0b}
1122 ABCD1234\x{6ca}\x{a6c}\x{10a7}_
1125 \x{6ca}\x{a6c}\x{10a7}_
1128 ABCD1234\x{6ca}\x{a6c}\x{10a7}_
1131 A_B12\x{6ca}\x{a6c}\x{10a7}
1134 \x{6ca}\x{a6c}\x{10a7}_
1147 ABCD1234\x{6ca}\x{a6c}\x{10a7}_
1149 # A check not in UTF-8 mode
1154 # Some negative checks
1169 \x{1680}\x{6f4}\x{1680}
1187 \x{1680}\x{6f4}\x{1680}
1224 # Unicode properties for \b and \B
1229 \x{37e}\x{376}\x{371}\x{393}\x{394}
1230 !\x{c0}++\x{c1}\x{c2}
1233 # Without PCRE2_UCP, non-ASCII characters always fail, even if < 256
1239 \x{37e}\x{376}\x{371}\x{393}\x{394}
1240 !\x{c0}++\x{c1}\x{c2}
1243 # With PCRE2_UCP, non-UTF-8 characters that are < 256 still check properties
1247 !\x{c0}++\x{c1}\x{c2}
1250 # Some of these are silly, but they check various combinations
1252 /[[:^alpha:][:^cntrl:]]+/B,utf,ucp
1256 /[[:^cntrl:][:^alpha:]]+/B,utf,ucp
1260 /[[:alpha:]]+/B,utf,ucp
1263 /[[:^alpha:]\S]+/B,utf,ucp
1272 /\p{Lu}+9\p{Lu}+B\p{Lu}+b/B
1274 /\p{^Lu}+9\p{^Lu}+B\p{^Lu}+b/B
1276 /\P{Lu}+9\P{Lu}+B\P{Lu}+b/B
1278 /\p{Han}+X\p{Greek}+\x{370}/B,utf
1280 /\p{Xan}+!\p{Xan}+A/B
1282 /\p{Xsp}+!\p{Xsp}\t/B
1284 /\p{Xps}+!\p{Xps}\t/B
1286 /\p{Xwd}+!\p{Xwd}_/B
1288 /A+\p{N}A+\dB+\p{N}*B+\d*/B,ucp
1290 # These behaved oddly in Perl, so they are kept in this test
1292 /(\x{23a}\x{23a}\x{23a})?\1/i,utf
1294 \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
1300 /(\x{23a}\x{23a}\x{23a})?\1/i,utf
1301 \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
1306 /(\x{23a}\x{23a}\x{23a})\1/i,utf
1308 \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
1314 /(\x{23a}\x{23a}\x{23a})\1/i,utf
1315 \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
1320 /(\x{2c65}\x{2c65})\1/i,utf
1321 \x{2c65}\x{2c65}\x{23a}\x{23a}
1326 /(\x{23a}\x{23a}\x{23a})\1Y/i,utf
1327 X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
1329 /(\x{2c65}\x{2c65})\1Y/i,utf
1330 X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
1332 # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
1362 /^a\X41z/alt_bsux,allow_empty_class,match_unset_backref,dupnames
1391 # These Unicode 6.1.0 scripts are not known to Perl.
1393 /\p{Chakma}\d/utf,ucp
1396 /\p{Takri}\d/utf,ucp
1414 A\x{300}\x{301}A\x{300}\x{301}\=ps
1415 A\x{300}\x{301}A\x{300}\x{301}\=ph
1420 A\x{300}\x{301}A\x{300}\x{301}\=ps
1421 A\x{300}\x{301}A\x{300}\x{301}\=ph
1445 /\x{3a3}*\x{3c2}/Bi,utf
1447 /\x{3a3}{3}/i,utf,aftertext
1448 \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
1450 /\x{3a3}{2,4}/i,utf,aftertext
1451 \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
1453 /\x{3a3}{2,4}?/i,utf,aftertext
1454 \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
1456 /\x{3a3}+./i,utf,aftertext
1457 \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
1459 /\x{3a3}++./i,utf,aftertext
1461 \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
1463 /\x{3a3}*\x{3c2}/Bi,utf
1465 /[^\x{3a3}]*\x{3c2}/Bi,utf
1467 /[^a]*\x{3c2}/Bi,utf
1490 # This property is a PCRE special
1501 $@`\x{a0}\x{1234}\x{e000}**
1506 $@`\x{a0}\x{1234}\x{e000}**
1511 $@`\x{a0}\x{1234}\x{e000}**
1516 $@`\x{a0}\x{1234}\x{e000}**
1521 $@`\x{a0}\x{1234}\x{e000}**
1526 $@`\x{a0}\x{1234}\x{e000}**
1531 $@`\x{a0}\x{1234}\x{e000}**
1536 $@`\x{a0}\x{1234}\x{e000}**
1556 # Some auto-possessification tests
1566 /\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp
1568 /\p{L&}+\p{Any} \p{L&}+\p{L&} \P{L&}+\p{L&} \p{L&}+\p{L} \p{L&}+\p{Lu} \p{L&}+\p{Han} \p{L&}+\p{Xan} \p{L&}+\P{Xan} \p{L&}+\p{Xsp} \p{L&}+\p{Xps} \p{Xwd}+\p{L&} \p{L&}+\p{Xuc}/Bx,ucp
1570 /\p{N}+\p{Any} \p{N}+\p{L&} \p{N}+\p{L} \p{N}+\P{L} \p{N}+\P{N} \p{N}+\p{Lu} \p{N}+\p{Han} \p{N}+\p{Xan} \p{N}+\p{Xsp} \p{N}+\p{Xps} \p{Xwd}+\p{N} \p{N}+\p{Xuc}/Bx,ucp
1572 /\p{Lu}+\p{Any} \p{Lu}+\p{L&} \p{Lu}+\p{L} \p{Lu}+\p{Lu} \P{Lu}+\p{Lu} \p{Lu}+\p{Nd} \p{Lu}+\P{Nd} \p{Lu}+\p{Han} \p{Lu}+\p{Xan} \p{Lu}+\p{Xsp} \p{Lu}+\p{Xps} \p{Xwd}+\p{Lu} \p{Lu}+\p{Xuc}/Bx,ucp
1574 /\p{Han}+\p{Lu} \p{Han}+\p{L&} \p{Han}+\p{L} \p{Han}+\p{Lu} \p{Han}+\p{Arabic} \p{Arabic}+\p{Arabic} \p{Han}+\p{Xan} \p{Han}+\p{Xsp} \p{Han}+\p{Xps} \p{Xwd}+\p{Han} \p{Han}+\p{Xuc}/Bx,ucp
1576 /\p{Xan}+\p{Any} \p{Xan}+\p{L&} \P{Xan}+\p{L&} \p{Xan}+\p{L} \p{Xan}+\p{Lu} \p{Xan}+\p{Han} \p{Xan}+\p{Xan} \p{Xan}+\P{Xan} \p{Xan}+\p{Xsp} \p{Xan}+\p{Xps} \p{Xwd}+\p{Xan} \p{Xan}+\p{Xuc}/Bx,ucp
1578 /\p{Xsp}+\p{Any} \p{Xsp}+\p{L&} \p{Xsp}+\p{L} \p{Xsp}+\p{Lu} \p{Xsp}+\p{Han} \p{Xsp}+\p{Xan} \p{Xsp}+\p{Xsp} \P{Xsp}+\p{Xsp} \p{Xsp}+\p{Xps} \p{Xwd}+\p{Xsp} \p{Xsp}+\p{Xuc}/Bx,ucp
1580 /\p{Xwd}+\p{Any} \p{Xwd}+\p{L&} \p{Xwd}+\p{L} \p{Xwd}+\p{Lu} \p{Xwd}+\p{Han} \p{Xwd}+\p{Xan} \p{Xwd}+\p{Xsp} \p{Xwd}+\p{Xps} \p{Xwd}+\p{Xwd} \p{Xwd}+\P{Xwd} \p{Xwd}+\p{Xuc}/Bx,ucp
1582 /\p{Xuc}+\p{Any} \p{Xuc}+\p{L&} \p{Xuc}+\p{L} \p{Xuc}+\p{Lu} \p{Xuc}+\p{Han} \p{Xuc}+\p{Xan} \p{Xuc}+\p{Xsp} \p{Xuc}+\p{Xps} \p{Xwd}+\p{Xuc} \p{Xuc}+\p{Xuc} \p{Xuc}+\P{Xuc}/Bx,ucp
1584 /\p{N}+\p{Ll} \p{N}+\p{Nd} \p{N}+\P{Nd}/Bx,ucp
1586 /\p{Xan}+\p{L} \p{Xan}+\p{N} \p{Xan}+\p{C} \p{Xan}+\P{L} \P{Xan}+\p{N} \p{Xan}+\P{C}/Bx,ucp
1588 /\p{L}+\p{Xan} \p{N}+\p{Xan} \p{C}+\p{Xan} \P{L}+\p{Xan} \p{N}+\p{Xan} \P{C}+\p{Xan} \p{L}+\P{Xan}/Bx,ucp
1590 /\p{Xan}+\p{Lu} \p{Xan}+\p{Nd} \p{Xan}+\p{Cc} \p{Xan}+\P{Ll} \P{Xan}+\p{No} \p{Xan}+\P{Cf}/Bx,ucp
1592 /\p{Lu}+\p{Xan} \p{Nd}+\p{Xan} \p{Cs}+\p{Xan} \P{Lt}+\p{Xan} \p{Nl}+\p{Xan} \P{Cc}+\p{Xan} \p{Lt}+\P{Xan}/Bx,ucp
1594 /\w+\p{P} \w+\p{Po} \w+\s \p{Xan}+\s \s+\p{Xan} \s+\w/Bx,ucp
1596 /\w+\P{P} \W+\p{Po} \w+\S \P{Xan}+\s \s+\P{Xan} \s+\W/Bx,ucp
1598 /\w+\p{Po} \w+\p{Pc} \W+\p{Po} \W+\p{Pc} \w+\P{Po} \w+\P{Pc}/Bx,ucp
1600 /\p{Nl}+\p{Xan} \P{Nl}+\p{Xan} \p{Nl}+\P{Xan} \P{Nl}+\P{Xan}/Bx,ucp
1602 /\p{Xan}+\p{Nl} \P{Xan}+\p{Nl} \p{Xan}+\P{Nl} \P{Xan}+\P{Nl}/Bx,ucp
1604 /\p{Xan}+\p{Nd} \P{Xan}+\p{Nd} \p{Xan}+\P{Nd} \P{Xan}+\P{Nd}/Bx,ucp
1606 # End auto-possessification tests
1608 /\w+/B,utf,ucp,auto_callout
1611 /[\p{N}]?+/B,no_auto_possess
1613 /[\p{L}ab]{2,3}+/B,no_auto_possess
1615 /\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
1621 /\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
1623 /\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp
1634 /\X?abc/utf,no_start_optimize
1635 \xff\x7f\x00\x00\x03\x00\x41\xcc\x80\x41\x{300}\x61\x62\x63\x00\=no_utf_check,offset=06
1637 /\x{100}\x{200}\K\x{300}/utf,startchar
1638 \x{100}\x{200}\x{300}
1640 # Test UTF characters in a substitution
1642 /ábc/utf,replace=XሴZ
1645 /(?<=abc)(|def)/g,utf,replace=<$0>
1646 123abcáyzabcdef789abcሴqr
1651 /(?<=\K\x{17f})/g,utf,aftertext
1652 \x{17f}\x{17f}\x{17f}\x{17f}\x{17f}
1654 /(?<=\K\x{17f})/altglobal,utf,aftertext
1655 \x{17f}\x{17f}\x{17f}\x{17f}\x{17f}
1657 "\xa\xf<(.\pZ*\P{Xwd}+^\xa8\3'3yq.::?(?J:()\xd1+!~:3'(8?:)':(?'d'(?'d'^u]!.+.+\\A\Ah(n+?9){7}+\K;(?'X'u'(?'c'(?'z'(?<y>\xb::\xf0'|\xd3(\xae?'w(z\x8?P>l)\x8?P>a)'\H\R\xd1+!!~:3'(?:h$N{26875}\W+?\\=D{2}\x89(?i:Uy0\N({2\xa(\v\x85*){y*\A(()\p{L}+?\P{^Xan}'+?\xff\+pS\?|).{;y*\A(()\p{L}+?\8}\d?1(|)(/1){7}.+[Lp{Me}].\s\xdcC*?(?(<y>))(?<!^)$C((;*?(R))+(\xbf(R))\x8a\X*?\x8a\xb\xd1^9\3*+(\xc1,\k'R'\xb4)\xcc(z\z(?J)(?'X'\x1b(\xb\xd1^9\?'3*+P{^Xan}+?\xff\+(\xc1.]k+\xb'Pm'\xb4)\xcc4f\xa7'\xd1V(?i:U,{2,2})'(?'X'))?-%--\x95$9*\4'|\xd1(\x9c''%\x94$9)#(?'R')3\x7?('P\xed7'\xa8\xb1^u\xeaw\1\0\0\(|(?1){7}.+[\p{Me}].\s\xdcC*^\x14?(?(<y>))(?<!^)$C((;*?(R*?))+(?(R)\x8a\X*?\x8a\xb\xd1^9\3*+|(\xc1,\k'R'\xb4)\xcc! z)\z(?JJ)(?'X';(\xb\xd1^9\?'3*+(\xc1.]k+\xb'Pm'\xb4))':(?'d')(?'RD'(d')|)|$)'|(?<x>\g{d});\g{x}\x11\g{d}\x81\|$((?'X'\'X'(?'W''\x92()'9'\x83*))\xba*\!?^ <){)':;\xcc4'\xd1'(?'X'28))?-%--\x95$9*\4'|\xd1((''e\x94*$9:)*#(?'R')3)\x7?('P\xed')\\x16:;()\x1e\x10*:(?<y>)\xd1+0!~:(?)'d'E:yD!\s(?'R'\x1e;\x10:U))|'\x9g!\xb0*){)\\x16:;()\x1e\x10\x87*:(?<y>)\xd1+!~:(?)'}'\d'E:yD!\s(?'R'\x1e;\x10:U))|'))|)g!\xb0*R+9{29+)#(?'P'})*?pS\{3,}\x85,{0,}l{*UTF)(\xe{7}){3722,{9,}d{2,?|))|{)\(A?&d}}{\xa,}2}){3,}7,l{)22}(,}l:7{2,4}}29\x19+)#?'P'})*v?))\x5"
1659 /$(&.+[\p{Me}].\s\xdcC*?(?(<y>))(?<!^)$C((;*?(R))+(?(R)){0,6}?|){12\x8a\X*?\x8a\x0b\xd1^9\3*+(\xc1,\k'P'\xb4)\xcc(z\z(?JJ)(?'X'8};(\x0b\xd1^9\?'3*+(\xc1.]k+\x0b'Pm'\xb4\xcc4'\xd1'(?'X'))?-%--\x95$9*\4'|\xd1(''%\x95*$9)#(?'R')3\x07?('P\xed')\\x16:;()\x1e\x10*:(?<y>)\xd1+!~:(?)''(d'E:yD!\s(?'R'\x1e;\x10:U))|')g!\xb0*){29+))#(?'P'})*?/
1661 "(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
1666 /(*:a\x{12345}b\t(d\)c)xxx/utf,alt_verbnames,mark
1669 /abcd/utf,replace=x\x{824}y\o{3333}z(\Q12\$34$$\x34\E5$$),substitute_extended
1672 /a(\x{e0}\x{101})(\x{c0}\x{102})/utf,replace=a\u$1\U$1\E$1\l$2\L$2\Eab\U\x{e0}\x{101}\L\x{d0}\x{160}\EDone,substitute_extended
1673 a\x{e0}\x{101}\x{c0}\x{102}
1675 /((?<digit>\d)|(?<letter>\p{L}))/g,substitute_extended,replace=<${digit:+digit; :not digit; }${letter:+letter:not a letter}>
1678 /(*UCP)(*UTF)[[:>:]]X/B
1680 /abc/utf,replace=xyz
1683 /a[[:punct:]b]/ucp,bincode
1685 /a[[:punct:]b]/utf,ucp,bincode
1687 /a[b[:punct:]]/utf,ucp,bincode
1689 /[[:^ascii:]]/utf,ucp,bincode
1691 /[[:^ascii:]\w]/utf,ucp,bincode
1693 /[\w[:^ascii:]]/utf,ucp,bincode
1695 /[^[:ascii:]\W]/utf,ucp,bincode
1702 /[[:^ascii:]a]/utf,ucp,bincode
1704 /L(?#(|++<!(2)?/B,utf,no_auto_possess,auto_callout
1706 /L(?#(|++<!(2)?/B,utf,ucp,auto_callout
1708 /(*UTF)C\x09((?<!'(?x)!*H? #\xcc\x9a[^$]/
1727 # Hex uses pattern length, not zero-terminated. This tests for overrunning
1728 # the given length of a pattern.
1732 /'#('/hex,extended,utf
1738 /../utf,auto_callout
1739 \n\x{123}\x{123}\x{123}\x{123}
1741 # This tests processing wide characters in extended mode.
1745 # These three test a bug fix that was not clearing up after a locale setting
1746 # when the test or a subsequent one matched a wide character.
1753 /[\P{Yi}]/utf,locale=C
1758 # Horizontal and vertical space lists ignore caseless
1770 /\p{Nd}{0,3}[\pL](*:abc)(?C1)xxx/callout_info
1772 # ---------------------------------------------------------------------------
1774 # A bunch of tests that hit lines of code that others do not (at least when
1775 # these were created).
1777 /^[^a]{3,}?x/i,utf,no_start_optimize,no_auto_possess
1782 /^[ac]{3,}?x/i,utf,no_start_optimize,no_auto_possess
1786 /^X\X/no_start_optimize,no_auto_possess
1790 /^X\p{L&}+?/no_start_optimize,no_auto_possess
1794 /^X\p{L}+?/no_start_optimize,no_auto_possess
1798 /^X\p{Lu}+?/no_start_optimize,no_auto_possess
1802 /^X\p{Arabic}+?/no_start_optimize,no_auto_possess
1806 /^X\p{Xan}+?/ucp,no_start_optimize,no_auto_possess
1810 /^X\s+?/ucp,no_start_optimize,no_auto_possess
1815 /^X\S+?/ucp,no_start_optimize,no_auto_possess
1820 /^X\w+?/ucp,no_start_optimize,no_auto_possess
1824 /^X[^\x{b5}]+?/i,utf,no_start_optimize,no_auto_possess
1828 /^X[\x{b5}]+?/i,utf,no_start_optimize,no_auto_possess
1832 /^X\p{Xuc}+?/utf,no_start_optimize,no_auto_possess
1836 /^X.+?Z/s,utf,no_start_optimize,no_auto_possess
1840 /^X\R+?/utf,no_start_optimize,no_auto_possess
1844 /^X\H+?/utf,no_start_optimize,no_auto_possess
1848 /^X\V+?/utf,no_start_optimize,no_auto_possess
1852 /^X\s+?/utf,no_start_optimize,no_auto_possess
1857 /^X\S+?/utf,no_start_optimize,no_auto_possess
1861 /^X\p{Any}{1,3}?Z/s,no_start_optimize,no_auto_possess
1869 /^X\p{L&}{1,3}?Z/s,no_start_optimize,no_auto_possess
1874 /^X\p{L}{1,3}?Z/s,no_start_optimize,no_auto_possess
1879 /^X\p{Lu}{1,3}?Z/s,no_start_optimize,no_auto_possess
1884 /^X\P{Han}{1,3}?Z/s,utf,no_start_optimize,no_auto_possess
1890 /^X\p{Xan}{1,3}?Z/s,no_start_optimize,no_auto_possess
1895 /^X\p{Xsp}{1,3}?Z/s,no_start_optimize,no_auto_possess
1901 /^X\P{Xsp}{1,3}?Z/s,no_start_optimize,no_auto_possess
1905 /^X\p{Xwd}{1,3}?Z/s,no_start_optimize,no_auto_possess
1911 /^X\x{b5}+?Z/i,utf,no_start_optimize,no_auto_possess
1917 /^X\p{Xuc}+?Z/utf,no_start_optimize,no_auto_possess
1923 /(*CRLF)^X.+?Z/utf,no_start_optimize,no_auto_possess
1924 \= Expect partial match
1929 /^X.+?Z/s,utf,no_start_optimize,no_auto_possess
1934 /^X\R+?Z/utf,no_start_optimize,no_auto_possess
1942 /(*BSR_ANYCRLF)^X\R+?Z/utf,no_start_optimize,no_auto_possess
1950 /^X\H+?Z/utf,no_start_optimize,no_auto_possess
1955 /^X\h+?Z/utf,no_start_optimize,no_auto_possess
1960 /^X\V+?Z/utf,no_start_optimize,no_auto_possess
1965 /^X\v+?Z/utf,no_start_optimize,no_auto_possess
1970 /^X\D+?Z/utf,no_start_optimize,no_auto_possess
1975 /^X\d+?Z/utf,no_start_optimize,no_auto_possess
1980 /^X\S+?Z/utf,no_start_optimize,no_auto_possess
1985 /^X\s+?Z/utf,no_start_optimize,no_auto_possess
1990 /^X\W+?Z/utf,no_start_optimize,no_auto_possess
1995 /^X\p{L&}{1,3}Z/no_start_optimize,no_auto_possess
2000 /^X\p{L}{1,3}Z/no_start_optimize,no_auto_possess
2004 /^X\p{Xan}{1,3}Z/no_start_optimize,no_auto_possess
2008 /^X\P{Xsp}{1,3}Z/no_start_optimize,no_auto_possess
2012 /^X\p{Xuc}+Z/utf,no_start_optimize,no_auto_possess
2016 # ----------------------------------------------------------------------
2017 # These test the dangerous PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL option.
2019 /\x{d800}/B,utf,bad_escape_is_literal
2021 /\ud800/B,utf,alt_bsux,bad_escape_is_literal
2023 # ----------------------------------------------------------------------
2025 /Aሴ+B/literal,utf,no_utf_check
2028 # These are here because I upgraded to Unicode 10.0.0 before Perl did, so it
2029 # doesn't recognize all these scripts. In time these three tests can be moved
2032 /^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
2033 (\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
2034 (\p{Zanabazar_Square}+)/x,utf
2035 \x{1E900}\x{1E924}\x{1E953}\x{11C00}\x{11C2D}\x{11C3E}\x{11C70}\x{11C77}\x{11CAB}\x{11400}\x{1142F}\x{11455}\x{104B0}\x{104D8}\x{104FB}\x{16FE0}\x{18800}\x{18AF2}\x{11D00}\x{11D3A}\x{11D59}\x{16FE1}\x{1B170}\x{1B2FB}\x{11A50}\x{11A58}\x{11AA2}\x{11A00}\x{11A07}\x{11A47}
2037 /^\x{1E900}\x{104B0}/i,utf
2041 /^(?:(\X)(?C))+$/utf
2042 \x{1E900}\x{1E924}\x{1E953}\x{11C00}\x{11C2D}\x{11C3E}\x{11C70}\x{11C77}\x{11CAB}\x{11400}\x{1142F}\x{11455}\x{104B0}\x{104D8}\x{104FB}\x{16FE0}\x{18800}\x{18AF2}\x{11D00}\x{11D3A}\x{11D59}\x{16FE1}\x{1B170}\x{1B2FB}\x{11A50}\x{11A58}\x{11AA2}\x{11A00}\x{11A07}\x{11A47}\=callout_capture,callout_no_where
2044 # Similarly for Unicode 11.0.0
2046 /^(\p{Dogra}+)(\p{Gunjala_Gondi}+)(\p{Hanifi_Rohingya}+)(\p{Makasar}+)
2047 (\p{Medefaidrin}+)(\p{Old_Sogdian}+)(\p{Sogdian}+)/x,utf
2048 \x{11800}\x{11da9}\x{10d27}\x{11ee0}\x{16e48}\x{10f27}\x{10f30}
2050 # These two are here because of differences from Perl.
2054 \x{261d}\x{261d}B Extended_Pictographic Extended_Pictographic
2055 \x{261D}\x{1F3FB}B Extended_Pictographic Extend
2056 \x{1F1E6}\x{1F1E7}B RegionalIndicator RegionalIndicator
2057 \x{261D}\x{1F3FB}\x{261d}B Extended_Pictographic Extend E-P
2058 \x{261D}\x{1F3FB}\x{200d}\x{261d}B Extended_Pictographic Extend ZWJ E-P
2060 # Regional indicators
2062 /^(\X)(\X)/utf,aftertext
2063 \x{1F1E6}\x{1F1E7}\x{1F1E7}B
2064 \x{1F1E6}\x{1F1E7}\x{1F1E7}\x{1F1E6}B
2066 # More differences from Perl
2096 # This tests the non-UTF Unicode NEL pattern whitespace character, only
2097 # recognized by PCRE2 with /x when there is Unicode support.
2103 # This tests Unicode Pattern White Space characters in verb names when they
2104 # are being processed with PCRE2_EXTENDED. Note: there are UTF-8 characters
2105 # with code points greater than 255 between A, B, and C in the pattern.
2107 /(*: AB
C)abc/x,utf,mark,alt_verbnames