/* pcre2_ucp.h */

/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/
   4
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2018 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
  40
  41
  42 #ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
  43 #define PCRE2_UCP_H_IDEMPOTENT_GUARD
  44
/* This file contains definitions of the property values that are returned by
the UCD access macros. New values that are added for new releases of Unicode
should always be at the end of each enum, for backwards compatibility.

IMPORTANT: Note also that the specific numeric values of the enums have to be
the same as the values that are generated by the maint/MultiStage2.py script,
where the equivalent property descriptive names are listed in vectors.

ALSO: The specific values of the first two enums are assumed for the table
called catposstab in pcre2_compile.c. */
  55
  56 /* These are the general character categories. */
  57
  58 enum {
  59   ucp_C,     /* Other */
  60   ucp_L,     /* Letter */
  61   ucp_M,     /* Mark */
  62   ucp_N,     /* Number */
  63   ucp_P,     /* Punctuation */
  64   ucp_S,     /* Symbol */
  65   ucp_Z      /* Separator */
  66 };
  67
  68 /* These are the particular character categories. */
  69
  70 enum {
  71   ucp_Cc,    /* Control */
  72   ucp_Cf,    /* Format */
  73   ucp_Cn,    /* Unassigned */
  74   ucp_Co,    /* Private use */
  75   ucp_Cs,    /* Surrogate */
  76   ucp_Ll,    /* Lower case letter */
  77   ucp_Lm,    /* Modifier letter */
  78   ucp_Lo,    /* Other letter */
  79   ucp_Lt,    /* Title case letter */
  80   ucp_Lu,    /* Upper case letter */
  81   ucp_Mc,    /* Spacing mark */
  82   ucp_Me,    /* Enclosing mark */
  83   ucp_Mn,    /* Non-spacing mark */
  84   ucp_Nd,    /* Decimal number */
  85   ucp_Nl,    /* Letter number */
  86   ucp_No,    /* Other number */
  87   ucp_Pc,    /* Connector punctuation */
  88   ucp_Pd,    /* Dash punctuation */
  89   ucp_Pe,    /* Close punctuation */
  90   ucp_Pf,    /* Final punctuation */
  91   ucp_Pi,    /* Initial punctuation */
  92   ucp_Po,    /* Other punctuation */
  93   ucp_Ps,    /* Open punctuation */
  94   ucp_Sc,    /* Currency symbol */
  95   ucp_Sk,    /* Modifier symbol */
  96   ucp_Sm,    /* Mathematical symbol */
  97   ucp_So,    /* Other symbol */
  98   ucp_Zl,    /* Line separator */
  99   ucp_Zp,    /* Paragraph separator */
 100   ucp_Zs     /* Space separator */
 101 };
 102
 103 /* These are grapheme break properties. The Extended Pictographic property
 104 comes from the emoji-data.txt file. */
 105
 106 enum {
 107   ucp_gbCR,                    /*  0 */
 108   ucp_gbLF,                    /*  1 */
 109   ucp_gbControl,               /*  2 */
 110   ucp_gbExtend,                /*  3 */
 111   ucp_gbPrepend,               /*  4 */
 112   ucp_gbSpacingMark,           /*  5 */
 113   ucp_gbL,                     /*  6 Hangul syllable type L */
 114   ucp_gbV,                     /*  7 Hangul syllable type V */
 115   ucp_gbT,                     /*  8 Hangul syllable type T */
 116   ucp_gbLV,                    /*  9 Hangul syllable type LV */
 117   ucp_gbLVT,                   /* 10 Hangul syllable type LVT */
 118   ucp_gbRegionalIndicator,     /* 11 */
 119   ucp_gbOther,                 /* 12 */
 120   ucp_gbZWJ,                   /* 13 */
 121   ucp_gbExtended_Pictographic  /* 14 */
 122 };
 123
 124 /* These are the script identifications. */
 125
 126 enum {
 127   ucp_Arabic,
 128   ucp_Armenian,
 129   ucp_Bengali,
 130   ucp_Bopomofo,
 131   ucp_Braille,
 132   ucp_Buginese,
 133   ucp_Buhid,
 134   ucp_Canadian_Aboriginal,
 135   ucp_Cherokee,
 136   ucp_Common,
 137   ucp_Coptic,
 138   ucp_Cypriot,
 139   ucp_Cyrillic,
 140   ucp_Deseret,
 141   ucp_Devanagari,
 142   ucp_Ethiopic,
 143   ucp_Georgian,
 144   ucp_Glagolitic,
 145   ucp_Gothic,
 146   ucp_Greek,
 147   ucp_Gujarati,
 148   ucp_Gurmukhi,
 149   ucp_Han,
 150   ucp_Hangul,
 151   ucp_Hanunoo,
 152   ucp_Hebrew,
 153   ucp_Hiragana,
 154   ucp_Inherited,
 155   ucp_Kannada,
 156   ucp_Katakana,
 157   ucp_Kharoshthi,
 158   ucp_Khmer,
 159   ucp_Lao,
 160   ucp_Latin,
 161   ucp_Limbu,
 162   ucp_Linear_B,
 163   ucp_Malayalam,
 164   ucp_Mongolian,
 165   ucp_Myanmar,
 166   ucp_New_Tai_Lue,
 167   ucp_Ogham,
 168   ucp_Old_Italic,
 169   ucp_Old_Persian,
 170   ucp_Oriya,
 171   ucp_Osmanya,
 172   ucp_Runic,
 173   ucp_Shavian,
 174   ucp_Sinhala,
 175   ucp_Syloti_Nagri,
 176   ucp_Syriac,
 177   ucp_Tagalog,
 178   ucp_Tagbanwa,
 179   ucp_Tai_Le,
 180   ucp_Tamil,
 181   ucp_Telugu,
 182   ucp_Thaana,
 183   ucp_Thai,
 184   ucp_Tibetan,
 185   ucp_Tifinagh,
 186   ucp_Ugaritic,
 187   ucp_Yi,
 188   /* New for Unicode 5.0 */
 189   ucp_Balinese,
 190   ucp_Cuneiform,
 191   ucp_Nko,
 192   ucp_Phags_Pa,
 193   ucp_Phoenician,
 194   /* New for Unicode 5.1 */
 195   ucp_Carian,
 196   ucp_Cham,
 197   ucp_Kayah_Li,
 198   ucp_Lepcha,
 199   ucp_Lycian,
 200   ucp_Lydian,
 201   ucp_Ol_Chiki,
 202   ucp_Rejang,
 203   ucp_Saurashtra,
 204   ucp_Sundanese,
 205   ucp_Vai,
 206   /* New for Unicode 5.2 */
 207   ucp_Avestan,
 208   ucp_Bamum,
 209   ucp_Egyptian_Hieroglyphs,
 210   ucp_Imperial_Aramaic,
 211   ucp_Inscriptional_Pahlavi,
 212   ucp_Inscriptional_Parthian,
 213   ucp_Javanese,
 214   ucp_Kaithi,
 215   ucp_Lisu,
 216   ucp_Meetei_Mayek,
 217   ucp_Old_South_Arabian,
 218   ucp_Old_Turkic,
 219   ucp_Samaritan,
 220   ucp_Tai_Tham,
 221   ucp_Tai_Viet,
 222   /* New for Unicode 6.0.0 */
 223   ucp_Batak,
 224   ucp_Brahmi,
 225   ucp_Mandaic,
 226   /* New for Unicode 6.1.0 */
 227   ucp_Chakma,
 228   ucp_Meroitic_Cursive,
 229   ucp_Meroitic_Hieroglyphs,
 230   ucp_Miao,
 231   ucp_Sharada,
 232   ucp_Sora_Sompeng,
 233   ucp_Takri,
 234   /* New for Unicode 7.0.0 */
 235   ucp_Bassa_Vah,
 236   ucp_Caucasian_Albanian,
 237   ucp_Duployan,
 238   ucp_Elbasan,
 239   ucp_Grantha,
 240   ucp_Khojki,
 241   ucp_Khudawadi,
 242   ucp_Linear_A,
 243   ucp_Mahajani,
 244   ucp_Manichaean,
 245   ucp_Mende_Kikakui,
 246   ucp_Modi,
 247   ucp_Mro,
 248   ucp_Nabataean,
 249   ucp_Old_North_Arabian,
 250   ucp_Old_Permic,
 251   ucp_Pahawh_Hmong,
 252   ucp_Palmyrene,
 253   ucp_Psalter_Pahlavi,
 254   ucp_Pau_Cin_Hau,
 255   ucp_Siddham,
 256   ucp_Tirhuta,
 257   ucp_Warang_Citi,
 258   /* New for Unicode 8.0.0 */
 259   ucp_Ahom,
 260   ucp_Anatolian_Hieroglyphs,
 261   ucp_Hatran,
 262   ucp_Multani,
 263   ucp_Old_Hungarian,
 264   ucp_SignWriting,
 265   /* New for Unicode 10.0.0 (no update since 8.0.0) */
 266   ucp_Adlam,
 267   ucp_Bhaiksuki,
 268   ucp_Marchen,
 269   ucp_Newa,
 270   ucp_Osage,
 271   ucp_Tangut,
 272   ucp_Masaram_Gondi,
 273   ucp_Nushu,
 274   ucp_Soyombo,
 275   ucp_Zanabazar_Square,
 276   /* New for Unicode 11.0.0 */
 277   ucp_Dogra,
 278   ucp_Gunjala_Gondi,
 279   ucp_Hanifi_Rohingya,
 280   ucp_Makasar,
 281   ucp_Medefaidrin,
 282   ucp_Old_Sogdian,
 283   ucp_Sogdian
 284 };
 285
 286 #endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */
 287
/* End of pcre2_ucp.h */