From: kosako Date: Mon, 2 May 2016 08:57:58 +0000 (+0900) Subject: add doc/UNICODE_PROPERTIES X-Git-Tag: v6.0.0^2~21 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=929026d0e396bfc6bd5be536af2b062785d531a3;p=onig add doc/UNICODE_PROPERTIES --- diff --git a/doc/RE b/doc/RE index 21efe53..b4bf536 100644 --- a/doc/RE +++ b/doc/RE @@ -1,4 +1,4 @@ -Oniguruma Regular Expressions Version 5.9.1 2007/09/05 +Oniguruma Regular Expressions Version 6.0.0 2016/05/02 syntax: ONIG_SYNTAX_RUBY (default) @@ -86,19 +86,7 @@ syntax: ONIG_SYNTAX_RUBY (default) Hiragana, Katakana + works on UTF8, UTF16, UTF32 - Any, Assigned, C, Cc, Cf, Cn, Co, Cs, L, Ll, Lm, Lo, Lt, Lu, - M, Mc, Me, Mn, N, Nd, Nl, No, P, Pc, Pd, Pe, Pf, Pi, Po, Ps, - S, Sc, Sk, Sm, So, Z, Zl, Zp, Zs, - Arabic, Armenian, Bengali, Bopomofo, Braille, Buginese, - Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic, - Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian, - Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, - Hanunoo, Hebrew, Hiragana, Inherited, Kannada, Katakana, - Kharoshthi, Khmer, Lao, Latin, Limbu, Linear_B, Malayalam, - Mongolian, Myanmar, New_Tai_Lue, Ogham, Old_Italic, Old_Persian, - Oriya, Osmanya, Runic, Shavian, Sinhala, Syloti_Nagri, Syriac, - Tagalog, Tagbanwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, - Tifinagh, Ugaritic, Yi + See doc/UNICODE_PROPERTIES. diff --git a/doc/RE.ja b/doc/RE.ja index abde849..bc877f2 100644 --- a/doc/RE.ja +++ b/doc/RE.ja @@ -1,4 +1,4 @@ -µ´¼Ö Àµµ¬É½¸½ Version 5.9.1 2007/09/05 +µ´¼Ö Àµµ¬É½¸½ Version 6.0.0 2016/05/02 »ÈÍÑʸˡ: ONIG_SYNTAX_RUBY (´ûÄêÃÍ) @@ -86,19 +86,7 @@ Hiragana, Katakana + UTF8, UTF16, UTF32¤ÇÍ­¸ú - Any, Assigned, C, Cc, Cf, Cn, Co, Cs, L, Ll, Lm, Lo, Lt, Lu, - M, Mc, Me, Mn, N, Nd, Nl, No, P, Pc, Pd, Pe, Pf, Pi, Po, Ps, - S, Sc, Sk, Sm, So, Z, Zl, Zp, Zs, - Arabic, Armenian, Bengali, Bopomofo, Braille, Buginese, - Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic, - Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian, - Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, - Hanunoo, Hebrew, Hiragana, Inherited, Kannada, Katakana, - Kharoshthi, Khmer, Lao, Latin, Limbu, Linear_B, Malayalam, - Mongolian, Myanmar, New_Tai_Lue, Ogham, Old_Italic, Old_Persian, - Oriya, Osmanya, Runic, Shavian, Sinhala, Syloti_Nagri, Syriac, - Tagalog, Tagbanwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, - Tifinagh, Ugaritic, Yi + doc/UNICODE_PROPERTIES»²¾È diff --git a/doc/UNICODE_PROPERTIES b/doc/UNICODE_PROPERTIES new file mode 100644 index 0000000..3634e4d --- /dev/null +++ b/doc/UNICODE_PROPERTIES @@ -0,0 +1,677 @@ +Unicode Properties (from Unicode Version: 7.0.0) + + 1: Any + 2: Assigned + 3: C + 4: Cc + 5: Cf + 6: Cn + 7: Co + 8: Cs + 9: L + 10: LC + 11: Ll + 12: Lm + 13: Lo + 14: Lt + 15: Lu + 16: M + 17: Mc + 18: Me + 19: Mn + 20: N + 21: Nd + 22: Nl + 23: No + 24: P + 25: Pc + 26: Pd + 27: Pe + 28: Pf + 29: Pi + 30: Po + 31: Ps + 32: S + 33: Sc + 34: Sk + 35: Sm + 36: So + 37: Z + 38: Zl + 39: Zp + 40: Zs + 41: Math + 42: Alphabetic + 43: Lowercase + 44: Uppercase + 45: Cased + 46: Case_Ignorable + 47: Changes_When_Lowercased + 48: Changes_When_Uppercased + 49: Changes_When_Titlecased + 50: Changes_When_Casefolded + 51: Changes_When_Casemapped + 52: ID_Start + 53: ID_Continue + 54: XID_Start + 55: XID_Continue + 56: Default_Ignorable_Code_Point + 57: Grapheme_Extend + 58: Grapheme_Base + 59: Grapheme_Link + 60: Common + 61: Latin + 62: Greek + 63: Cyrillic + 64: Armenian + 65: Hebrew + 66: Arabic + 67: Syriac + 68: Thaana + 69: Devanagari + 70: Bengali + 71: Gurmukhi + 72: Gujarati + 73: Oriya + 74: Tamil + 75: Telugu + 76: Kannada + 77: Malayalam + 78: Sinhala + 79: Thai + 80: Lao + 81: Tibetan + 82: Myanmar + 83: Georgian + 84: Hangul + 85: Ethiopic + 86: Cherokee + 87: Canadian_Aboriginal + 88: Ogham + 89: Runic + 90: Khmer + 91: Mongolian + 92: Hiragana + 93: Katakana + 94: Bopomofo + 95: Han + 96: Yi + 97: Old_Italic + 98: Gothic + 99: Deseret +100: Inherited +101: Tagalog +102: Hanunoo +103: Buhid +104: Tagbanwa +105: Limbu +106: Tai_Le +107: Linear_B +108: Ugaritic +109: Shavian +110: Osmanya +111: Cypriot +112: Braille +113: Buginese +114: Coptic +115: New_Tai_Lue +116: Glagolitic +117: Tifinagh +118: Syloti_Nagri +119: Old_Persian +120: Kharoshthi +121: Balinese +122: Cuneiform +123: Phoenician +124: Phags_Pa +125: Nko +126: Sundanese +127: Lepcha +128: Ol_Chiki +129: Vai +130: Saurashtra +131: Kayah_Li +132: Rejang +133: Lycian +134: Carian +135: Lydian +136: Cham +137: Tai_Tham +138: Tai_Viet +139: Avestan +140: Egyptian_Hieroglyphs +141: Samaritan +142: Lisu +143: Bamum +144: Javanese +145: Meetei_Mayek +146: Imperial_Aramaic +147: Old_South_Arabian +148: Inscriptional_Parthian +149: Inscriptional_Pahlavi +150: Old_Turkic +151: Kaithi +152: Batak +153: Brahmi +154: Mandaic +155: Chakma +156: Meroitic_Cursive +157: Meroitic_Hieroglyphs +158: Miao +159: Sharada +160: Sora_Sompeng +161: Takri +162: Caucasian_Albanian +163: Bassa_Vah +164: Duployan +165: Elbasan +166: Grantha +167: Pahawh_Hmong +168: Khojki +169: Linear_A +170: Mahajani +171: Manichaean +172: Mende_Kikakui +173: Modi +174: Mro +175: Old_North_Arabian +176: Nabataean +177: Palmyrene +178: Pau_Cin_Hau +179: Old_Permic +180: Psalter_Pahlavi +181: Siddham +182: Khudawadi +183: Tirhuta +184: Warang_Citi +185: White_Space +186: Bidi_Control +187: Join_Control +188: Dash +189: Hyphen +190: Quotation_Mark +191: Terminal_Punctuation +192: Other_Math +193: Hex_Digit +194: ASCII_Hex_Digit +195: Other_Alphabetic +196: Ideographic +197: Diacritic +198: Extender +199: Other_Lowercase +200: Other_Uppercase +201: Noncharacter_Code_Point +202: Other_Grapheme_Extend +203: IDS_Binary_Operator +204: IDS_Trinary_Operator +205: Radical +206: Unified_Ideograph +207: Other_Default_Ignorable_Code_Point +208: Deprecated +209: Soft_Dotted +210: Logical_Order_Exception +211: Other_ID_Start +212: Other_ID_Continue +213: STerm +214: Variation_Selector +215: Pattern_White_Space +216: Pattern_Syntax +217: Unknown +218: Aghb +219: AHex +220: Arab +221: Armi +222: Armn +223: Avst +224: Bali +225: Bamu +226: Bass +227: Batk +228: Beng +229: Bidi_C +230: Bopo +231: Brah +232: Brai +233: Bugi +234: Buhd +235: Cakm +236: Cans +237: Cari +238: Cased_Letter +239: Cher +240: CI +241: Close_Punctuation +242: Combining_Mark +243: Connector_Punctuation +244: Control +245: Copt +246: Cprt +247: Currency_Symbol +248: CWCF +249: CWCM +250: CWL +251: CWT +252: CWU +253: Cyrl +254: Dash_Punctuation +255: Decimal_Number +256: Dep +257: Deva +258: DI +259: Dia +260: Dsrt +261: Dupl +262: Egyp +263: Elba +264: Enclosing_Mark +265: Ethi +266: Ext +267: Final_Punctuation +268: Format +269: Geor +270: Glag +271: Goth +272: Gran +273: Gr_Base +274: Grek +275: Gr_Ext +276: Gr_Link +277: Gujr +278: Guru +279: Hang +280: Hani +281: Hano +282: Hebr +283: Hex +284: Hira +285: Hmng +286: IDC +287: Ideo +288: IDS +289: IDSB +290: IDST +291: Initial_Punctuation +292: Ital +293: Java +294: Join_C +295: Kali +296: Kana +297: Khar +298: Khmr +299: Khoj +300: Knda +301: Kthi +302: Lana +303: Laoo +304: Latn +305: Lepc +306: Letter +307: Letter_Number +308: Limb +309: Lina +310: Linb +311: Line_Separator +312: LOE +313: Lowercase_Letter +314: Lyci +315: Lydi +316: Mahj +317: Mand +318: Mani +319: Mark +320: Math_Symbol +321: Mend +322: Merc +323: Mero +324: Mlym +325: Modifier_Letter +326: Modifier_Symbol +327: Mong +328: Mroo +329: Mtei +330: Mymr +331: Narb +332: Nbat +333: NChar +334: Nkoo +335: Nonspacing_Mark +336: Number +337: OAlpha +338: ODI +339: Ogam +340: OGr_Ext +341: OIDC +342: OIDS +343: Olck +344: OLower +345: OMath +346: Open_Punctuation +347: Orkh +348: Orya +349: Osma +350: Other +351: Other_Letter +352: Other_Number +353: Other_Punctuation +354: Other_Symbol +355: OUpper +356: Palm +357: Paragraph_Separator +358: Pat_Syn +359: Pat_WS +360: Pauc +361: Perm +362: Phag +363: Phli +364: Phlp +365: Phnx +366: Plrd +367: Private_Use +368: Prti +369: Punctuation +370: Qaac +371: Qaai +372: QMark +373: Rjng +374: Runr +375: Samr +376: Sarb +377: Saur +378: SD +379: Separator +380: Shaw +381: Shrd +382: Sidd +383: Sind +384: Sinh +385: Sora +386: Space_Separator +387: Spacing_Mark +388: Sund +389: Surrogate +390: Sylo +391: Symbol +392: Syrc +393: Tagb +394: Takr +395: Tale +396: Talu +397: Taml +398: Tavt +399: Telu +400: Term +401: Tfng +402: Tglg +403: Thaa +404: Tibt +405: Tirh +406: Titlecase_Letter +407: Ugar +408: UIdeo +409: Unassigned +410: Uppercase_Letter +411: Vaii +412: VS +413: Wara +414: WSpace +415: XIDC +416: XIDS +417: Xpeo +418: Xsux +419: Yiii +420: Zinh +421: Zyyy +422: Zzzz +423: In_Basic_Latin +424: In_Latin_1_Supplement +425: In_Latin_Extended_A +426: In_Latin_Extended_B +427: In_IPA_Extensions +428: In_Spacing_Modifier_Letters +429: In_Combining_Diacritical_Marks +430: In_Greek_and_Coptic +431: In_Cyrillic +432: In_Cyrillic_Supplement +433: In_Armenian +434: In_Hebrew +435: In_Arabic +436: In_Syriac +437: In_Arabic_Supplement +438: In_Thaana +439: In_NKo +440: In_Samaritan +441: In_Mandaic +442: In_Arabic_Extended_A +443: In_Devanagari +444: In_Bengali +445: In_Gurmukhi +446: In_Gujarati +447: In_Oriya +448: In_Tamil +449: In_Telugu +450: In_Kannada +451: In_Malayalam +452: In_Sinhala +453: In_Thai +454: In_Lao +455: In_Tibetan +456: In_Myanmar +457: In_Georgian +458: In_Hangul_Jamo +459: In_Ethiopic +460: In_Ethiopic_Supplement +461: In_Cherokee +462: In_Unified_Canadian_Aboriginal_Syllabics +463: In_Ogham +464: In_Runic +465: In_Tagalog +466: In_Hanunoo +467: In_Buhid +468: In_Tagbanwa +469: In_Khmer +470: In_Mongolian +471: In_Unified_Canadian_Aboriginal_Syllabics_Extended +472: In_Limbu +473: In_Tai_Le +474: In_New_Tai_Lue +475: In_Khmer_Symbols +476: In_Buginese +477: In_Tai_Tham +478: In_Combining_Diacritical_Marks_Extended +479: In_Balinese +480: In_Sundanese +481: In_Batak +482: In_Lepcha +483: In_Ol_Chiki +484: In_Sundanese_Supplement +485: In_Vedic_Extensions +486: In_Phonetic_Extensions +487: In_Phonetic_Extensions_Supplement +488: In_Combining_Diacritical_Marks_Supplement +489: In_Latin_Extended_Additional +490: In_Greek_Extended +491: In_General_Punctuation +492: In_Superscripts_and_Subscripts +493: In_Currency_Symbols +494: In_Combining_Diacritical_Marks_for_Symbols +495: In_Letterlike_Symbols +496: In_Number_Forms +497: In_Arrows +498: In_Mathematical_Operators +499: In_Miscellaneous_Technical +500: In_Control_Pictures +501: In_Optical_Character_Recognition +502: In_Enclosed_Alphanumerics +503: In_Box_Drawing +504: In_Block_Elements +505: In_Geometric_Shapes +506: In_Miscellaneous_Symbols +507: In_Dingbats +508: In_Miscellaneous_Mathematical_Symbols_A +509: In_Supplemental_Arrows_A +510: In_Braille_Patterns +511: In_Supplemental_Arrows_B +512: In_Miscellaneous_Mathematical_Symbols_B +513: In_Supplemental_Mathematical_Operators +514: In_Miscellaneous_Symbols_and_Arrows +515: In_Glagolitic +516: In_Latin_Extended_C +517: In_Coptic +518: In_Georgian_Supplement +519: In_Tifinagh +520: In_Ethiopic_Extended +521: In_Cyrillic_Extended_A +522: In_Supplemental_Punctuation +523: In_CJK_Radicals_Supplement +524: In_Kangxi_Radicals +525: In_Ideographic_Description_Characters +526: In_CJK_Symbols_and_Punctuation +527: In_Hiragana +528: In_Katakana +529: In_Bopomofo +530: In_Hangul_Compatibility_Jamo +531: In_Kanbun +532: In_Bopomofo_Extended +533: In_CJK_Strokes +534: In_Katakana_Phonetic_Extensions +535: In_Enclosed_CJK_Letters_and_Months +536: In_CJK_Compatibility +537: In_CJK_Unified_Ideographs_Extension_A +538: In_Yijing_Hexagram_Symbols +539: In_CJK_Unified_Ideographs +540: In_Yi_Syllables +541: In_Yi_Radicals +542: In_Lisu +543: In_Vai +544: In_Cyrillic_Extended_B +545: In_Bamum +546: In_Modifier_Tone_Letters +547: In_Latin_Extended_D +548: In_Syloti_Nagri +549: In_Common_Indic_Number_Forms +550: In_Phags_pa +551: In_Saurashtra +552: In_Devanagari_Extended +553: In_Kayah_Li +554: In_Rejang +555: In_Hangul_Jamo_Extended_A +556: In_Javanese +557: In_Myanmar_Extended_B +558: In_Cham +559: In_Myanmar_Extended_A +560: In_Tai_Viet +561: In_Meetei_Mayek_Extensions +562: In_Ethiopic_Extended_A +563: In_Latin_Extended_E +564: In_Meetei_Mayek +565: In_Hangul_Syllables +566: In_Hangul_Jamo_Extended_B +567: In_High_Surrogates +568: In_High_Private_Use_Surrogates +569: In_Low_Surrogates +570: In_Private_Use_Area +571: In_CJK_Compatibility_Ideographs +572: In_Alphabetic_Presentation_Forms +573: In_Arabic_Presentation_Forms_A +574: In_Variation_Selectors +575: In_Vertical_Forms +576: In_Combining_Half_Marks +577: In_CJK_Compatibility_Forms +578: In_Small_Form_Variants +579: In_Arabic_Presentation_Forms_B +580: In_Halfwidth_and_Fullwidth_Forms +581: In_Specials +582: In_Linear_B_Syllabary +583: In_Linear_B_Ideograms +584: In_Aegean_Numbers +585: In_Ancient_Greek_Numbers +586: In_Ancient_Symbols +587: In_Phaistos_Disc +588: In_Lycian +589: In_Carian +590: In_Coptic_Epact_Numbers +591: In_Old_Italic +592: In_Gothic +593: In_Old_Permic +594: In_Ugaritic +595: In_Old_Persian +596: In_Deseret +597: In_Shavian +598: In_Osmanya +599: In_Elbasan +600: In_Caucasian_Albanian +601: In_Linear_A +602: In_Cypriot_Syllabary +603: In_Imperial_Aramaic +604: In_Palmyrene +605: In_Nabataean +606: In_Phoenician +607: In_Lydian +608: In_Meroitic_Hieroglyphs +609: In_Meroitic_Cursive +610: In_Kharoshthi +611: In_Old_South_Arabian +612: In_Old_North_Arabian +613: In_Manichaean +614: In_Avestan +615: In_Inscriptional_Parthian +616: In_Inscriptional_Pahlavi +617: In_Psalter_Pahlavi +618: In_Old_Turkic +619: In_Rumi_Numeral_Symbols +620: In_Brahmi +621: In_Kaithi +622: In_Sora_Sompeng +623: In_Chakma +624: In_Mahajani +625: In_Sharada +626: In_Sinhala_Archaic_Numbers +627: In_Khojki +628: In_Khudawadi +629: In_Grantha +630: In_Tirhuta +631: In_Siddham +632: In_Modi +633: In_Takri +634: In_Warang_Citi +635: In_Pau_Cin_Hau +636: In_Cuneiform +637: In_Cuneiform_Numbers_and_Punctuation +638: In_Egyptian_Hieroglyphs +639: In_Bamum_Supplement +640: In_Mro +641: In_Bassa_Vah +642: In_Pahawh_Hmong +643: In_Miao +644: In_Kana_Supplement +645: In_Duployan +646: In_Shorthand_Format_Controls +647: In_Byzantine_Musical_Symbols +648: In_Musical_Symbols +649: In_Ancient_Greek_Musical_Notation +650: In_Tai_Xuan_Jing_Symbols +651: In_Counting_Rod_Numerals +652: In_Mathematical_Alphanumeric_Symbols +653: In_Mende_Kikakui +654: In_Arabic_Mathematical_Alphabetic_Symbols +655: In_Mahjong_Tiles +656: In_Domino_Tiles +657: In_Playing_Cards +658: In_Enclosed_Alphanumeric_Supplement +659: In_Enclosed_Ideographic_Supplement +660: In_Miscellaneous_Symbols_and_Pictographs +661: In_Emoticons +662: In_Ornamental_Dingbats +663: In_Transport_and_Map_Symbols +664: In_Alchemical_Symbols +665: In_Geometric_Shapes_Extended +666: In_Supplemental_Arrows_C +667: In_CJK_Unified_Ideographs_Extension_B +668: In_CJK_Unified_Ideographs_Extension_C +669: In_CJK_Unified_Ideographs_Extension_D +670: In_CJK_Compatibility_Ideographs_Supplement +671: In_Tags +672: In_Variation_Selectors_Supplement +673: In_Supplementary_Private_Use_Area_A +674: In_Supplementary_Private_Use_Area_B +675: In_No_Block diff --git a/src/.gitignore b/src/.gitignore index d13fee3..e9781fc 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -3,6 +3,7 @@ config.h CaseFolding.txt unicode_fold?_key.gperf unicode_unfold_key.gperf +UNICODE_PROPERTIES *.o *.so *.lo diff --git a/src/make_unicode_property_data.py b/src/make_unicode_property_data.py index 9826279..25ed092 100755 --- a/src/make_unicode_property_data.py +++ b/src/make_unicode_property_data.py @@ -386,6 +386,15 @@ def set_max_prop_name(name): if n > PROPERTY_NAME_MAX_LEN: PROPERTY_NAME_MAX_LEN = n +LIST_COUNTER = 1 +def entry_prop_name(name, index): + global LIST_COUNTER + set_max_prop_name(name) + if OUTPUT_LIST and index >= len(POSIX_LIST): + print >> UPF, "%3d: %s" % (LIST_COUNTER, name) + LIST_COUNTER += 1 + + ### main ### argv = sys.argv argc = len(argv) @@ -395,6 +404,8 @@ if argc >= 2: if argv[1] == '-posix': POSIX_ONLY = True +OUTPUT_LIST = not(POSIX_ONLY) + with open('UnicodeData.txt', 'r') as f: dic, assigned = parse_unicode_data_file(f) DIC = dic @@ -478,17 +489,23 @@ struct PropertyNameCtype { ''' sys.stdout.write(s) +if OUTPUT_LIST: + UPF = open("UNICODE_PROPERTIES", "w") + if VERSION_INFO is not None: + print >> UPF, "Unicode Properties (from Unicode Version: %s)" % VERSION_INFO + print >> UPF, '' + index = -1 for prop in POSIX_LIST: index += 1 - set_max_prop_name(prop) + entry_prop_name(prop, index) prop = normalize_prop_name(prop) print_prop_and_index(prop, index) if not(POSIX_ONLY): for prop in PROPS: index += 1 - set_max_prop_name(prop) + entry_prop_name(prop, index) prop = normalize_prop_name(prop) print_prop_and_index(prop, index) @@ -504,12 +521,12 @@ if not(POSIX_ONLY): #print >> sys.stderr, "ALIASES: value is not exist: %s => %s" % (k, v) continue - set_max_prop_name(k) + entry_prop_name(k, index) print_prop_and_index(nk, index) for name in BLOCKS: index += 1 - set_max_prop_name(name) + entry_prop_name(name, index) name = normalize_prop_name(name) print_prop_and_index(name, index) @@ -522,4 +539,7 @@ if VERSION_INFO is not None: print "#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10) print "#define CODE_RANGES_NUM %d" % (index + 1) +if OUTPUT_LIST: + UPF.close() + sys.exit(0)