-Oniguruma Regular Expressions Version 5.9.1 2007/09/05
+Oniguruma Regular Expressions Version 6.0.0 2016/05/02
syntax: ONIG_SYNTAX_RUBY (default)
Hiragana, Katakana
+ works on UTF8, UTF16, UTF32
- Any, Assigned, C, Cc, Cf, Cn, Co, Cs, L, Ll, Lm, Lo, Lt, Lu,
- M, Mc, Me, Mn, N, Nd, Nl, No, P, Pc, Pd, Pe, Pf, Pi, Po, Ps,
- S, Sc, Sk, Sm, So, Z, Zl, Zp, Zs,
- Arabic, Armenian, Bengali, Bopomofo, Braille, Buginese,
- Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic,
- Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian,
- Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul,
- Hanunoo, Hebrew, Hiragana, Inherited, Kannada, Katakana,
- Kharoshthi, Khmer, Lao, Latin, Limbu, Linear_B, Malayalam,
- Mongolian, Myanmar, New_Tai_Lue, Ogham, Old_Italic, Old_Persian,
- Oriya, Osmanya, Runic, Shavian, Sinhala, Syloti_Nagri, Syriac,
- Tagalog, Tagbanwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan,
- Tifinagh, Ugaritic, Yi
+ See doc/UNICODE_PROPERTIES.
-鬼車 正規表現 Version 5.9.1 2007/09/05
+鬼車 正規表現 Version 6.0.0 2016/05/02
使用文法: ONIG_SYNTAX_RUBY (既定値)
Hiragana, Katakana
+ UTF8, UTF16, UTF32で有効
- Any, Assigned, C, Cc, Cf, Cn, Co, Cs, L, Ll, Lm, Lo, Lt, Lu,
- M, Mc, Me, Mn, N, Nd, Nl, No, P, Pc, Pd, Pe, Pf, Pi, Po, Ps,
- S, Sc, Sk, Sm, So, Z, Zl, Zp, Zs,
- Arabic, Armenian, Bengali, Bopomofo, Braille, Buginese,
- Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic,
- Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian,
- Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul,
- Hanunoo, Hebrew, Hiragana, Inherited, Kannada, Katakana,
- Kharoshthi, Khmer, Lao, Latin, Limbu, Linear_B, Malayalam,
- Mongolian, Myanmar, New_Tai_Lue, Ogham, Old_Italic, Old_Persian,
- Oriya, Osmanya, Runic, Shavian, Sinhala, Syloti_Nagri, Syriac,
- Tagalog, Tagbanwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan,
- Tifinagh, Ugaritic, Yi
+ doc/UNICODE_PROPERTIES»²¾È
--- /dev/null
+Unicode Properties (from Unicode Version: 7.0.0)
+
+ 1: Any
+ 2: Assigned
+ 3: C
+ 4: Cc
+ 5: Cf
+ 6: Cn
+ 7: Co
+ 8: Cs
+ 9: L
+ 10: LC
+ 11: Ll
+ 12: Lm
+ 13: Lo
+ 14: Lt
+ 15: Lu
+ 16: M
+ 17: Mc
+ 18: Me
+ 19: Mn
+ 20: N
+ 21: Nd
+ 22: Nl
+ 23: No
+ 24: P
+ 25: Pc
+ 26: Pd
+ 27: Pe
+ 28: Pf
+ 29: Pi
+ 30: Po
+ 31: Ps
+ 32: S
+ 33: Sc
+ 34: Sk
+ 35: Sm
+ 36: So
+ 37: Z
+ 38: Zl
+ 39: Zp
+ 40: Zs
+ 41: Math
+ 42: Alphabetic
+ 43: Lowercase
+ 44: Uppercase
+ 45: Cased
+ 46: Case_Ignorable
+ 47: Changes_When_Lowercased
+ 48: Changes_When_Uppercased
+ 49: Changes_When_Titlecased
+ 50: Changes_When_Casefolded
+ 51: Changes_When_Casemapped
+ 52: ID_Start
+ 53: ID_Continue
+ 54: XID_Start
+ 55: XID_Continue
+ 56: Default_Ignorable_Code_Point
+ 57: Grapheme_Extend
+ 58: Grapheme_Base
+ 59: Grapheme_Link
+ 60: Common
+ 61: Latin
+ 62: Greek
+ 63: Cyrillic
+ 64: Armenian
+ 65: Hebrew
+ 66: Arabic
+ 67: Syriac
+ 68: Thaana
+ 69: Devanagari
+ 70: Bengali
+ 71: Gurmukhi
+ 72: Gujarati
+ 73: Oriya
+ 74: Tamil
+ 75: Telugu
+ 76: Kannada
+ 77: Malayalam
+ 78: Sinhala
+ 79: Thai
+ 80: Lao
+ 81: Tibetan
+ 82: Myanmar
+ 83: Georgian
+ 84: Hangul
+ 85: Ethiopic
+ 86: Cherokee
+ 87: Canadian_Aboriginal
+ 88: Ogham
+ 89: Runic
+ 90: Khmer
+ 91: Mongolian
+ 92: Hiragana
+ 93: Katakana
+ 94: Bopomofo
+ 95: Han
+ 96: Yi
+ 97: Old_Italic
+ 98: Gothic
+ 99: Deseret
+100: Inherited
+101: Tagalog
+102: Hanunoo
+103: Buhid
+104: Tagbanwa
+105: Limbu
+106: Tai_Le
+107: Linear_B
+108: Ugaritic
+109: Shavian
+110: Osmanya
+111: Cypriot
+112: Braille
+113: Buginese
+114: Coptic
+115: New_Tai_Lue
+116: Glagolitic
+117: Tifinagh
+118: Syloti_Nagri
+119: Old_Persian
+120: Kharoshthi
+121: Balinese
+122: Cuneiform
+123: Phoenician
+124: Phags_Pa
+125: Nko
+126: Sundanese
+127: Lepcha
+128: Ol_Chiki
+129: Vai
+130: Saurashtra
+131: Kayah_Li
+132: Rejang
+133: Lycian
+134: Carian
+135: Lydian
+136: Cham
+137: Tai_Tham
+138: Tai_Viet
+139: Avestan
+140: Egyptian_Hieroglyphs
+141: Samaritan
+142: Lisu
+143: Bamum
+144: Javanese
+145: Meetei_Mayek
+146: Imperial_Aramaic
+147: Old_South_Arabian
+148: Inscriptional_Parthian
+149: Inscriptional_Pahlavi
+150: Old_Turkic
+151: Kaithi
+152: Batak
+153: Brahmi
+154: Mandaic
+155: Chakma
+156: Meroitic_Cursive
+157: Meroitic_Hieroglyphs
+158: Miao
+159: Sharada
+160: Sora_Sompeng
+161: Takri
+162: Caucasian_Albanian
+163: Bassa_Vah
+164: Duployan
+165: Elbasan
+166: Grantha
+167: Pahawh_Hmong
+168: Khojki
+169: Linear_A
+170: Mahajani
+171: Manichaean
+172: Mende_Kikakui
+173: Modi
+174: Mro
+175: Old_North_Arabian
+176: Nabataean
+177: Palmyrene
+178: Pau_Cin_Hau
+179: Old_Permic
+180: Psalter_Pahlavi
+181: Siddham
+182: Khudawadi
+183: Tirhuta
+184: Warang_Citi
+185: White_Space
+186: Bidi_Control
+187: Join_Control
+188: Dash
+189: Hyphen
+190: Quotation_Mark
+191: Terminal_Punctuation
+192: Other_Math
+193: Hex_Digit
+194: ASCII_Hex_Digit
+195: Other_Alphabetic
+196: Ideographic
+197: Diacritic
+198: Extender
+199: Other_Lowercase
+200: Other_Uppercase
+201: Noncharacter_Code_Point
+202: Other_Grapheme_Extend
+203: IDS_Binary_Operator
+204: IDS_Trinary_Operator
+205: Radical
+206: Unified_Ideograph
+207: Other_Default_Ignorable_Code_Point
+208: Deprecated
+209: Soft_Dotted
+210: Logical_Order_Exception
+211: Other_ID_Start
+212: Other_ID_Continue
+213: STerm
+214: Variation_Selector
+215: Pattern_White_Space
+216: Pattern_Syntax
+217: Unknown
+218: Aghb
+219: AHex
+220: Arab
+221: Armi
+222: Armn
+223: Avst
+224: Bali
+225: Bamu
+226: Bass
+227: Batk
+228: Beng
+229: Bidi_C
+230: Bopo
+231: Brah
+232: Brai
+233: Bugi
+234: Buhd
+235: Cakm
+236: Cans
+237: Cari
+238: Cased_Letter
+239: Cher
+240: CI
+241: Close_Punctuation
+242: Combining_Mark
+243: Connector_Punctuation
+244: Control
+245: Copt
+246: Cprt
+247: Currency_Symbol
+248: CWCF
+249: CWCM
+250: CWL
+251: CWT
+252: CWU
+253: Cyrl
+254: Dash_Punctuation
+255: Decimal_Number
+256: Dep
+257: Deva
+258: DI
+259: Dia
+260: Dsrt
+261: Dupl
+262: Egyp
+263: Elba
+264: Enclosing_Mark
+265: Ethi
+266: Ext
+267: Final_Punctuation
+268: Format
+269: Geor
+270: Glag
+271: Goth
+272: Gran
+273: Gr_Base
+274: Grek
+275: Gr_Ext
+276: Gr_Link
+277: Gujr
+278: Guru
+279: Hang
+280: Hani
+281: Hano
+282: Hebr
+283: Hex
+284: Hira
+285: Hmng
+286: IDC
+287: Ideo
+288: IDS
+289: IDSB
+290: IDST
+291: Initial_Punctuation
+292: Ital
+293: Java
+294: Join_C
+295: Kali
+296: Kana
+297: Khar
+298: Khmr
+299: Khoj
+300: Knda
+301: Kthi
+302: Lana
+303: Laoo
+304: Latn
+305: Lepc
+306: Letter
+307: Letter_Number
+308: Limb
+309: Lina
+310: Linb
+311: Line_Separator
+312: LOE
+313: Lowercase_Letter
+314: Lyci
+315: Lydi
+316: Mahj
+317: Mand
+318: Mani
+319: Mark
+320: Math_Symbol
+321: Mend
+322: Merc
+323: Mero
+324: Mlym
+325: Modifier_Letter
+326: Modifier_Symbol
+327: Mong
+328: Mroo
+329: Mtei
+330: Mymr
+331: Narb
+332: Nbat
+333: NChar
+334: Nkoo
+335: Nonspacing_Mark
+336: Number
+337: OAlpha
+338: ODI
+339: Ogam
+340: OGr_Ext
+341: OIDC
+342: OIDS
+343: Olck
+344: OLower
+345: OMath
+346: Open_Punctuation
+347: Orkh
+348: Orya
+349: Osma
+350: Other
+351: Other_Letter
+352: Other_Number
+353: Other_Punctuation
+354: Other_Symbol
+355: OUpper
+356: Palm
+357: Paragraph_Separator
+358: Pat_Syn
+359: Pat_WS
+360: Pauc
+361: Perm
+362: Phag
+363: Phli
+364: Phlp
+365: Phnx
+366: Plrd
+367: Private_Use
+368: Prti
+369: Punctuation
+370: Qaac
+371: Qaai
+372: QMark
+373: Rjng
+374: Runr
+375: Samr
+376: Sarb
+377: Saur
+378: SD
+379: Separator
+380: Shaw
+381: Shrd
+382: Sidd
+383: Sind
+384: Sinh
+385: Sora
+386: Space_Separator
+387: Spacing_Mark
+388: Sund
+389: Surrogate
+390: Sylo
+391: Symbol
+392: Syrc
+393: Tagb
+394: Takr
+395: Tale
+396: Talu
+397: Taml
+398: Tavt
+399: Telu
+400: Term
+401: Tfng
+402: Tglg
+403: Thaa
+404: Tibt
+405: Tirh
+406: Titlecase_Letter
+407: Ugar
+408: UIdeo
+409: Unassigned
+410: Uppercase_Letter
+411: Vaii
+412: VS
+413: Wara
+414: WSpace
+415: XIDC
+416: XIDS
+417: Xpeo
+418: Xsux
+419: Yiii
+420: Zinh
+421: Zyyy
+422: Zzzz
+423: In_Basic_Latin
+424: In_Latin_1_Supplement
+425: In_Latin_Extended_A
+426: In_Latin_Extended_B
+427: In_IPA_Extensions
+428: In_Spacing_Modifier_Letters
+429: In_Combining_Diacritical_Marks
+430: In_Greek_and_Coptic
+431: In_Cyrillic
+432: In_Cyrillic_Supplement
+433: In_Armenian
+434: In_Hebrew
+435: In_Arabic
+436: In_Syriac
+437: In_Arabic_Supplement
+438: In_Thaana
+439: In_NKo
+440: In_Samaritan
+441: In_Mandaic
+442: In_Arabic_Extended_A
+443: In_Devanagari
+444: In_Bengali
+445: In_Gurmukhi
+446: In_Gujarati
+447: In_Oriya
+448: In_Tamil
+449: In_Telugu
+450: In_Kannada
+451: In_Malayalam
+452: In_Sinhala
+453: In_Thai
+454: In_Lao
+455: In_Tibetan
+456: In_Myanmar
+457: In_Georgian
+458: In_Hangul_Jamo
+459: In_Ethiopic
+460: In_Ethiopic_Supplement
+461: In_Cherokee
+462: In_Unified_Canadian_Aboriginal_Syllabics
+463: In_Ogham
+464: In_Runic
+465: In_Tagalog
+466: In_Hanunoo
+467: In_Buhid
+468: In_Tagbanwa
+469: In_Khmer
+470: In_Mongolian
+471: In_Unified_Canadian_Aboriginal_Syllabics_Extended
+472: In_Limbu
+473: In_Tai_Le
+474: In_New_Tai_Lue
+475: In_Khmer_Symbols
+476: In_Buginese
+477: In_Tai_Tham
+478: In_Combining_Diacritical_Marks_Extended
+479: In_Balinese
+480: In_Sundanese
+481: In_Batak
+482: In_Lepcha
+483: In_Ol_Chiki
+484: In_Sundanese_Supplement
+485: In_Vedic_Extensions
+486: In_Phonetic_Extensions
+487: In_Phonetic_Extensions_Supplement
+488: In_Combining_Diacritical_Marks_Supplement
+489: In_Latin_Extended_Additional
+490: In_Greek_Extended
+491: In_General_Punctuation
+492: In_Superscripts_and_Subscripts
+493: In_Currency_Symbols
+494: In_Combining_Diacritical_Marks_for_Symbols
+495: In_Letterlike_Symbols
+496: In_Number_Forms
+497: In_Arrows
+498: In_Mathematical_Operators
+499: In_Miscellaneous_Technical
+500: In_Control_Pictures
+501: In_Optical_Character_Recognition
+502: In_Enclosed_Alphanumerics
+503: In_Box_Drawing
+504: In_Block_Elements
+505: In_Geometric_Shapes
+506: In_Miscellaneous_Symbols
+507: In_Dingbats
+508: In_Miscellaneous_Mathematical_Symbols_A
+509: In_Supplemental_Arrows_A
+510: In_Braille_Patterns
+511: In_Supplemental_Arrows_B
+512: In_Miscellaneous_Mathematical_Symbols_B
+513: In_Supplemental_Mathematical_Operators
+514: In_Miscellaneous_Symbols_and_Arrows
+515: In_Glagolitic
+516: In_Latin_Extended_C
+517: In_Coptic
+518: In_Georgian_Supplement
+519: In_Tifinagh
+520: In_Ethiopic_Extended
+521: In_Cyrillic_Extended_A
+522: In_Supplemental_Punctuation
+523: In_CJK_Radicals_Supplement
+524: In_Kangxi_Radicals
+525: In_Ideographic_Description_Characters
+526: In_CJK_Symbols_and_Punctuation
+527: In_Hiragana
+528: In_Katakana
+529: In_Bopomofo
+530: In_Hangul_Compatibility_Jamo
+531: In_Kanbun
+532: In_Bopomofo_Extended
+533: In_CJK_Strokes
+534: In_Katakana_Phonetic_Extensions
+535: In_Enclosed_CJK_Letters_and_Months
+536: In_CJK_Compatibility
+537: In_CJK_Unified_Ideographs_Extension_A
+538: In_Yijing_Hexagram_Symbols
+539: In_CJK_Unified_Ideographs
+540: In_Yi_Syllables
+541: In_Yi_Radicals
+542: In_Lisu
+543: In_Vai
+544: In_Cyrillic_Extended_B
+545: In_Bamum
+546: In_Modifier_Tone_Letters
+547: In_Latin_Extended_D
+548: In_Syloti_Nagri
+549: In_Common_Indic_Number_Forms
+550: In_Phags_pa
+551: In_Saurashtra
+552: In_Devanagari_Extended
+553: In_Kayah_Li
+554: In_Rejang
+555: In_Hangul_Jamo_Extended_A
+556: In_Javanese
+557: In_Myanmar_Extended_B
+558: In_Cham
+559: In_Myanmar_Extended_A
+560: In_Tai_Viet
+561: In_Meetei_Mayek_Extensions
+562: In_Ethiopic_Extended_A
+563: In_Latin_Extended_E
+564: In_Meetei_Mayek
+565: In_Hangul_Syllables
+566: In_Hangul_Jamo_Extended_B
+567: In_High_Surrogates
+568: In_High_Private_Use_Surrogates
+569: In_Low_Surrogates
+570: In_Private_Use_Area
+571: In_CJK_Compatibility_Ideographs
+572: In_Alphabetic_Presentation_Forms
+573: In_Arabic_Presentation_Forms_A
+574: In_Variation_Selectors
+575: In_Vertical_Forms
+576: In_Combining_Half_Marks
+577: In_CJK_Compatibility_Forms
+578: In_Small_Form_Variants
+579: In_Arabic_Presentation_Forms_B
+580: In_Halfwidth_and_Fullwidth_Forms
+581: In_Specials
+582: In_Linear_B_Syllabary
+583: In_Linear_B_Ideograms
+584: In_Aegean_Numbers
+585: In_Ancient_Greek_Numbers
+586: In_Ancient_Symbols
+587: In_Phaistos_Disc
+588: In_Lycian
+589: In_Carian
+590: In_Coptic_Epact_Numbers
+591: In_Old_Italic
+592: In_Gothic
+593: In_Old_Permic
+594: In_Ugaritic
+595: In_Old_Persian
+596: In_Deseret
+597: In_Shavian
+598: In_Osmanya
+599: In_Elbasan
+600: In_Caucasian_Albanian
+601: In_Linear_A
+602: In_Cypriot_Syllabary
+603: In_Imperial_Aramaic
+604: In_Palmyrene
+605: In_Nabataean
+606: In_Phoenician
+607: In_Lydian
+608: In_Meroitic_Hieroglyphs
+609: In_Meroitic_Cursive
+610: In_Kharoshthi
+611: In_Old_South_Arabian
+612: In_Old_North_Arabian
+613: In_Manichaean
+614: In_Avestan
+615: In_Inscriptional_Parthian
+616: In_Inscriptional_Pahlavi
+617: In_Psalter_Pahlavi
+618: In_Old_Turkic
+619: In_Rumi_Numeral_Symbols
+620: In_Brahmi
+621: In_Kaithi
+622: In_Sora_Sompeng
+623: In_Chakma
+624: In_Mahajani
+625: In_Sharada
+626: In_Sinhala_Archaic_Numbers
+627: In_Khojki
+628: In_Khudawadi
+629: In_Grantha
+630: In_Tirhuta
+631: In_Siddham
+632: In_Modi
+633: In_Takri
+634: In_Warang_Citi
+635: In_Pau_Cin_Hau
+636: In_Cuneiform
+637: In_Cuneiform_Numbers_and_Punctuation
+638: In_Egyptian_Hieroglyphs
+639: In_Bamum_Supplement
+640: In_Mro
+641: In_Bassa_Vah
+642: In_Pahawh_Hmong
+643: In_Miao
+644: In_Kana_Supplement
+645: In_Duployan
+646: In_Shorthand_Format_Controls
+647: In_Byzantine_Musical_Symbols
+648: In_Musical_Symbols
+649: In_Ancient_Greek_Musical_Notation
+650: In_Tai_Xuan_Jing_Symbols
+651: In_Counting_Rod_Numerals
+652: In_Mathematical_Alphanumeric_Symbols
+653: In_Mende_Kikakui
+654: In_Arabic_Mathematical_Alphabetic_Symbols
+655: In_Mahjong_Tiles
+656: In_Domino_Tiles
+657: In_Playing_Cards
+658: In_Enclosed_Alphanumeric_Supplement
+659: In_Enclosed_Ideographic_Supplement
+660: In_Miscellaneous_Symbols_and_Pictographs
+661: In_Emoticons
+662: In_Ornamental_Dingbats
+663: In_Transport_and_Map_Symbols
+664: In_Alchemical_Symbols
+665: In_Geometric_Shapes_Extended
+666: In_Supplemental_Arrows_C
+667: In_CJK_Unified_Ideographs_Extension_B
+668: In_CJK_Unified_Ideographs_Extension_C
+669: In_CJK_Unified_Ideographs_Extension_D
+670: In_CJK_Compatibility_Ideographs_Supplement
+671: In_Tags
+672: In_Variation_Selectors_Supplement
+673: In_Supplementary_Private_Use_Area_A
+674: In_Supplementary_Private_Use_Area_B
+675: In_No_Block
CaseFolding.txt
unicode_fold?_key.gperf
unicode_unfold_key.gperf
+UNICODE_PROPERTIES
*.o
*.so
*.lo
if n > PROPERTY_NAME_MAX_LEN:
PROPERTY_NAME_MAX_LEN = n
+LIST_COUNTER = 1
+def entry_prop_name(name, index):
+ global LIST_COUNTER
+ set_max_prop_name(name)
+ if OUTPUT_LIST and index >= len(POSIX_LIST):
+ print >> UPF, "%3d: %s" % (LIST_COUNTER, name)
+ LIST_COUNTER += 1
+
+
### main ###
argv = sys.argv
argc = len(argv)
if argv[1] == '-posix':
POSIX_ONLY = True
+OUTPUT_LIST = not(POSIX_ONLY)
+
with open('UnicodeData.txt', 'r') as f:
dic, assigned = parse_unicode_data_file(f)
DIC = dic
'''
sys.stdout.write(s)
+if OUTPUT_LIST:
+ UPF = open("UNICODE_PROPERTIES", "w")
+ if VERSION_INFO is not None:
+ print >> UPF, "Unicode Properties (from Unicode Version: %s)" % VERSION_INFO
+ print >> UPF, ''
+
index = -1
for prop in POSIX_LIST:
index += 1
- set_max_prop_name(prop)
+ entry_prop_name(prop, index)
prop = normalize_prop_name(prop)
print_prop_and_index(prop, index)
if not(POSIX_ONLY):
for prop in PROPS:
index += 1
- set_max_prop_name(prop)
+ entry_prop_name(prop, index)
prop = normalize_prop_name(prop)
print_prop_and_index(prop, index)
#print >> sys.stderr, "ALIASES: value is not exist: %s => %s" % (k, v)
continue
- set_max_prop_name(k)
+ entry_prop_name(k, index)
print_prop_and_index(nk, index)
for name in BLOCKS:
index += 1
- set_max_prop_name(name)
+ entry_prop_name(name, index)
name = normalize_prop_name(name)
print_prop_and_index(name, index)
print "#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10)
print "#define CODE_RANGES_NUM %d" % (index + 1)
+if OUTPUT_LIST:
+ UPF.close()
+
sys.exit(0)