From: Teodor Sigaev Date: Wed, 16 Mar 2016 13:47:03 +0000 (+0300) Subject: Improve script generating unaccent rules X-Git-Tag: REL9_6_BETA1~480 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9a206d063c410df7cd5da01b169b23bff413fef5;p=postgresql Improve script generating unaccent rules Script now use the standard Unicode transliterator Latin-ASCII. Author: Leonard Benedetti --- diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index b838d8f630..2f5520c819 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -1,20 +1,33 @@ -#!/usr/bin/python +#!/usr/bin/python2 +# -*- coding: utf-8 -*- # # This script builds unaccent.rules on standard output when given the -# contents of UnicodeData.txt[1] on standard input. Optionally includes -# ligature expansion, if --expand-ligatures is given on the command line. +# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as +# arguments. Optionally includes ligature expansion and Unicode CLDR +# Latin-ASCII transliterator, enabled by default, this can be disabled +# with "--no-ligatures-expansion" command line option. # # The approach is to use the Unicode decomposition data to identify # precomposed codepoints that are equivalent to a ligature of several # letters, or a base letter with any number of diacritical marks. -# There is also a small set of special cases for codepoints that we -# traditionally support even though Unicode doesn't consider them to -# be ligatures or letters with marks. # -# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt +# This approach handles most letters with diacritical marks and some +# ligatures. However, several characters (notably a majority of +# ligatures) don't have decomposition. To handle all these cases, one can +# use a standard Unicode transliterator available in Common Locale Data +# Repository (CLDR): Latin-ASCII. 
This transliterator associates Unicode +# characters to ASCII-range equivalents. Unless "--no-ligatures-expansion" +# option is enabled, the XML file of this transliterator [2] -- given as a +# command line argument -- will be parsed and used. +# +# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt +# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml + import re +import argparse import sys +import xml.etree.ElementTree as ET def print_record(codepoint, letter): print (unichr(codepoint) + "\t" + letter).encode("UTF-8") @@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table): assert(is_ligature(codepoint, table)) return [get_plain_letter(table[id], table) for id in codepoint.combining_ids] -def main(expand_ligatures): +def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): + """Parse the XML file and return a set of tuples (src, trg), where "src" + is the original character and "trg" the substitute.""" + charactersSet = set() + + # RegEx to parse rules + rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;') + + # construct tree from XML + transliterationTree = ET.parse(latinAsciiFilePath) + transliterationTreeRoot = transliterationTree.getroot() + + for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"): + matches = rulePattern.search(rule.text) + + # The regular expression captures four groups corresponding + # to the characters. + # + # Group 1: plain "src" char. Empty if group 2 is not. + # Group 2: unicode-escaped "src" char (e.g. "\u0110"). Empty if group 1 is not. + # + # Group 3: "trg" char(s) between quotes. Empty if group 4 is not. + # Group 4: plain "trg" char(s). Empty if group 3 is not. 
+ if matches is not None: + src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape') + trg = matches.group(3) if matches.group(3) is not None else matches.group(4) + + # "'" and """ are escaped + trg = trg.replace("\\'", "'").replace('\\"', '"') + + # the parser of unaccent only accepts non-whitespace characters + # for "src" and "trg" (see unaccent.c) + if not src.isspace() and not trg.isspace(): + charactersSet.add((ord(src), trg)) + + return charactersSet + +def special_cases(): + """Returns the special cases which are not handled by other methods""" + charactersSet = set() + + # Cyrillic + charactersSet.add((0x0401, u"\u0415")) # CYRILLIC CAPITAL LETTER IO + charactersSet.add((0x0451, u"\u0435")) # CYRILLIC SMALL LETTER IO + + # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F) + charactersSet.add((0x2103, u"\xb0C")) # DEGREE CELSIUS + charactersSet.add((0x2109, u"\xb0F")) # DEGREE FAHRENHEIT + charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT + + return charactersSet + +def main(args): # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings decomposition_type_pattern = re.compile(" *<[^>]*> *") table = {} all = [] + # unordered set for ensure uniqueness + charactersSet = set() + + # read file UnicodeData.txt + unicodeDataFile = open(args.unicodeDataFilePath, 'r') + # read everything we need into memory - for line in sys.stdin.readlines(): + for line in unicodeDataFile: fields = line.split(";") if len(fields) > 5: # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt @@ -89,35 +160,34 @@ def main(expand_ligatures): if codepoint.general_category.startswith('L') and \ len(codepoint.combining_ids) > 1: if is_letter_with_marks(codepoint, table): - print_record(codepoint.id, - chr(get_plain_letter(codepoint, table).id)) - elif expand_ligatures and is_ligature(codepoint, table): - print_record(codepoint.id, + charactersSet.add((codepoint.id, + 
chr(get_plain_letter(codepoint, table).id))) + elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): + charactersSet.add((codepoint.id, "".join(unichr(combining_codepoint.id) for combining_codepoint \ - in get_plain_letters(codepoint, table))) - - # some special cases - print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE - print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE - print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE - print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE - print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I - print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE - print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE - print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE - print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE - print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE - print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE - print_record(0x0167, "t") # LATIN SMALL LETTER t WITH STROKE - print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO - print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO - if expand_ligatures: - print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE - print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S - print_record(0x00e6, "ae") # LATIN SMALL LETTER AE - print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE - print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE + in get_plain_letters(codepoint, table)))) + + # add CLDR Latin-ASCII characters + if not args.noLigaturesExpansion: + charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath) + charactersSet |= special_cases() + + # sort for more convenient display + charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0]) + + for characterPair in charactersList: + print_record(characterPair[0], characterPair[1]) if __name__ == "__main__": - main(len(sys.argv) == 2 and 
sys.argv[1] == "--expand-ligatures") + parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.') + parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See .", type=str, required=True, dest='unicodeDataFilePath') + parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See .", type=str, dest='latinAsciiFilePath') + parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion') + args = parser.parse_args() + + if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None: + sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. 
Use \"-h\" option for help.') + sys.exit(1) + + main(args) diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 73c24a188b..84886da587 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -1,9 +1,18 @@ +© (C) +« << +­ - +® (R) +» >> +¼ 1/4 +½ 1/2 +¾ 3/4 À A Á A  A à A Ä A Å A +Æ AE Ç C È E É E @@ -13,23 +22,29 @@ Í I Î I Ï I +Ð D Ñ N Ò O Ó O Ô O Õ O Ö O +× * +Ø O Ù U Ú U Û U Ü U Ý Y +Þ TH +ß ss à a á a â a ã a ä a å a +æ ae ç c è e é e @@ -39,17 +54,21 @@ í i î i ï i +ð d ñ n ò o ó o ô o õ o ö o +÷ / +ø o ù u ú u û u ü u ý y +þ th ÿ y Ā A ā a @@ -67,6 +86,8 @@ č c Ď D ď d +Đ D +đ d Ē E ē e Ĕ E @@ -87,6 +108,8 @@ ģ g Ĥ H ĥ h +Ħ H +ħ h Ĩ I ĩ i Ī I @@ -96,30 +119,41 @@ Į I į i İ I +ı i IJ IJ ij ij Ĵ J ĵ j Ķ K ķ k +ĸ q Ĺ L ĺ l Ļ L ļ l Ľ L ľ l +Ŀ L +ŀ l +Ł L +ł l Ń N ń n Ņ N ņ n Ň N ň n +ʼn 'n +Ŋ N +ŋ n Ō O ō o Ŏ O ŏ o Ő O ő o +Œ OE +œ oe Ŕ R ŕ r Ŗ R @@ -138,6 +172,8 @@ ţ t Ť T ť t +Ŧ T +ŧ t Ũ U ũ u Ū U @@ -161,10 +197,46 @@ ż z Ž Z ž z +ſ s +ƀ b +Ɓ B +Ƃ B +ƃ b +Ƈ C +ƈ c +Ɖ D +Ɗ D +Ƌ D +ƌ d +Ɛ E +Ƒ F +ƒ f +Ɠ G +ƕ hv +Ɩ I +Ɨ I +Ƙ K +ƙ k +ƚ l +Ɲ N +ƞ n Ơ O ơ o +Ƣ OI +ƣ oi +Ƥ P +ƥ p +ƫ t +Ƭ T +ƭ t +Ʈ T Ư U ư u +Ʋ V +Ƴ Y +ƴ y +Ƶ Z +ƶ z DŽ DZ Dž Dz dž dz @@ -182,6 +254,8 @@ ǒ o Ǔ U ǔ u +Ǥ G +ǥ g Ǧ G ǧ g Ǩ K @@ -226,6 +300,9 @@ ț t Ȟ H ȟ h +ȡ d +Ȥ Z +ȥ z Ȧ A ȧ a Ȩ E @@ -234,6 +311,128 @@ ȯ o Ȳ Y ȳ y +ȴ l +ȵ n +ȶ t +ȷ j +ȸ db +ȹ qp +Ⱥ A +Ȼ C +ȼ c +Ƚ L +Ⱦ T +ȿ s +ɀ z +Ƀ B +Ʉ U +Ɇ E +ɇ e +Ɉ J +ɉ j +Ɍ R +ɍ r +Ɏ Y +ɏ y +ɓ b +ɕ c +ɖ d +ɗ d +ɛ e +ɟ j +ɠ g +ɡ g +ɢ G +ɦ h +ɧ h +ɨ i +ɪ I +ɫ l +ɬ l +ɭ l +ɱ m +ɲ n +ɳ n +ɴ N +ɶ OE +ɼ r +ɽ r +ɾ r +ʀ R +ʂ s +ʈ t +ʉ u +ʋ v +ʏ Y +ʐ z +ʑ z +ʙ B +ʛ G +ʜ H +ʝ j +ʟ L +ʠ q +ʣ dz +ʥ dz +ʦ ts +ʪ ls +ʫ lz +Ё Е +ё е +ᴀ A +ᴁ AE +ᴃ B +ᴄ C +ᴅ D +ᴆ D +ᴇ E +ᴊ J +ᴋ K +ᴌ L +ᴍ M +ᴏ O +ᴘ P +ᴛ T +ᴜ U +ᴠ V +ᴡ W +ᴢ Z +ᵫ ue +ᵬ b +ᵭ d +ᵮ f +ᵯ m +ᵰ n +ᵱ p +ᵲ r +ᵳ r +ᵴ s +ᵵ t +ᵶ z +ᵺ th +ᵻ I +ᵽ p +ᵾ U +ᶀ b +ᶁ d +ᶂ f +ᶃ g +ᶄ k +ᶅ l +ᶆ m +ᶇ n +ᶈ p +ᶉ r +ᶊ s +ᶌ v +ᶍ x +ᶎ z +ᶏ a +ᶑ d +ᶒ e +ᶓ e +ᶖ i +ᶙ u Ḁ A 
ḁ a Ḃ B @@ -356,6 +555,10 @@ ẗ t ẘ w ẙ y +ẚ a +ẜ s +ẝ s +ẞ SS Ạ A ạ a Ả A @@ -386,28 +589,461 @@ ỷ y Ỹ Y ỹ y +Ỻ LL +ỻ ll +Ỽ V +ỽ v +Ỿ Y +ỿ y +‐ - +‑ - +‒ - +– - +— - +― - +‖ || +‘ ' +’ ' +‚ , +‛ ' +“ " +” " +„ ,, +‟ " +․ . +‥ .. +… ... +′ ' +″ " +‹ < +› > +‼ !! +⁄ / +⁅ [ +⁆ ] +⁇ ?? +⁈ ?! +⁉ !? +⁎ * +₠ CE +₢ Cr +₣ Fr. +₤ L. +₧ Pts +₹ Rs +₺ TL +℀ a/c +℁ a/s +ℂ C +℃ °C +℅ c/o +℆ c/u +℉ °F +ℊ g +ℋ H +ℌ x +ℍ H +ℎ h +ℐ I +ℑ I +ℒ L +ℓ l +ℕ N +№ No +℗ (P) +ℙ P +ℚ Q +ℛ R +ℜ R +ℝ R +℞ Rx +℡ TEL +ℤ Z +ℨ Z +ℬ B +ℭ C +ℯ e +ℰ E +ℱ F +ℳ M +ℴ o +ℹ i +℻ FAX +ⅅ D +ⅆ d +ⅇ e +ⅈ i +ⅉ j +⅓ 1/3 +⅔ 2/3 +⅕ 1/5 +⅖ 2/5 +⅗ 3/5 +⅘ 4/5 +⅙ 1/6 +⅚ 5/6 +⅛ 1/8 +⅜ 3/8 +⅝ 5/8 +⅞ 7/8 +⅟ 1/ +Ⅰ I +Ⅱ II +Ⅲ III +Ⅳ IV +Ⅴ V +Ⅵ VI +Ⅶ VII +Ⅷ VIII +Ⅸ IX +Ⅹ X +Ⅺ XI +Ⅻ XII +Ⅼ L +Ⅽ C +Ⅾ D +Ⅿ M +ⅰ i +ⅱ ii +ⅲ iii +ⅳ iv +ⅴ v +ⅵ vi +ⅶ vii +ⅷ viii +ⅸ ix +ⅹ x +ⅺ xi +ⅻ xii +ⅼ l +ⅽ c +ⅾ d +ⅿ m +− - +∕ / +∖ \ +∣ | +∥ || +≪ << +≫ >> +⑴ (1) +⑵ (2) +⑶ (3) +⑷ (4) +⑸ (5) +⑹ (6) +⑺ (7) +⑻ (8) +⑼ (9) +⑽ (10) +⑾ (11) +⑿ (12) +⒀ (13) +⒁ (14) +⒂ (15) +⒃ (16) +⒄ (17) +⒅ (18) +⒆ (19) +⒇ (20) +⒈ 1. +⒉ 2. +⒊ 3. +⒋ 4. +⒌ 5. +⒍ 6. +⒎ 7. +⒏ 8. +⒐ 9. +⒑ 10. +⒒ 11. +⒓ 12. +⒔ 13. +⒕ 14. +⒖ 15. +⒗ 16. +⒘ 17. +⒙ 18. +⒚ 19. +⒛ 20. +⒜ (a) +⒝ (b) +⒞ (c) +⒟ (d) +⒠ (e) +⒡ (f) +⒢ (g) +⒣ (h) +⒤ (i) +⒥ (j) +⒦ (k) +⒧ (l) +⒨ (m) +⒩ (n) +⒪ (o) +⒫ (p) +⒬ (q) +⒭ (r) +⒮ (s) +⒯ (t) +⒰ (u) +⒱ (v) +⒲ (w) +⒳ (x) +⒴ (y) +⒵ (z) +⦅ (( +⦆ )) +⩴ ::= +⩵ == +⩶ === +、 , +。 . +〇 0 +〈 < +〉 > +《 << +》 >> +〔 [ +〕 ] +〘 [ +〙 ] +〚 [ +〛 ] +〝 " +〞 " +㍱ hPa +㍲ da +㍳ AU +㍴ bar +㍵ oV +㍶ pc +㍷ dm +㍺ IU +㎀ pA +㎁ nA +㎃ mA +㎄ kA +㎅ KB +㎆ MB +㎇ GB +㎈ cal +㎉ kcal +㎊ pF +㎋ nF +㎎ mg +㎏ kg +㎐ Hz +㎑ kHz +㎒ MHz +㎓ GHz +㎔ THz +㎙ fm +㎚ nm +㎜ mm +㎝ cm +㎞ km +㎧ m/s +㎩ Pa +㎪ kPa +㎫ MPa +㎬ GPa +㎭ rad +㎮ rad/s +㎰ ps +㎱ ns +㎳ ms +㎴ pV +㎵ nV +㎷ mV +㎸ kV +㎹ MV +㎺ pW +㎻ nW +㎽ mW +㎾ kW +㎿ MW +㏂ a.m. +㏃ Bq +㏄ cc +㏅ cd +㏆ C/kg +㏇ Co. +㏈ dB +㏉ Gy +㏊ ha +㏋ HP +㏌ in +㏍ KK +㏎ KM +㏏ kt +㏐ lm +㏑ ln +㏒ log +㏓ lx +㏔ mb +㏕ mil +㏖ mol +㏗ pH +㏘ p.m. 
+㏙ PPM +㏚ PR +㏛ sr +㏜ Sv +㏝ Wb +㏞ V/m +㏟ A/m ff ff fi fi fl fl ffi ffi ffl ffl +ſt st st st -Ø O -ø o -Đ D -đ d -ı i -Ħ H -ħ h -Ł L -ł l -ʼn 'n -Ŧ T -ŧ t -Ё Е -ё е -Æ AE -ß ss -æ ae -Œ OE -œ oe +︐ , +︑ , +︒ . +︓ : +︔ ; +︕ ! +︖ ? +︙ ... +︰ .. +︱ - +︲ - +︵ ( +︶ ) +︷ { +︸ } +︹ [ +︺ ] +︽ << +︾ >> +︿ < +﹀ > +﹇ [ +﹈ ] +﹐ , +﹑ , +﹒ . +﹔ ; +﹕ : +﹖ ? +﹗ ! +﹘ - +﹙ ( +﹚ ) +﹛ { +﹜ } +﹝ [ +﹞ ] +﹟ # +﹠ & +﹡ * +﹢ + +﹣ - +﹤ < +﹥ > +﹦ = +﹨ \ +﹩ $ +﹪ % +﹫ @ +! ! +" " +# # +$ $ +% % +& & +' ' +( ( +) ) +* * ++ + +, , +- - +. . +/ / +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +: : +; ; +< < += = +> > +? ? +@ @ +A A +B B +C C +D D +E E +F F +G G +H H +I I +J J +K K +L L +M M +N N +O O +P P +Q Q +R R +S S +T T +U U +V V +W W +X X +Y Y +Z Z +[ [ +\ \ +] ] +^ ^ +_ _ +` ` +a a +b b +c c +d d +e e +f f +g g +h h +i i +j j +k k +l l +m m +n n +o o +p p +q q +r r +s s +t t +u u +v v +w w +x x +y y +z z +{ { +| | +} } +~ ~ +⦅ (( +⦆ )) +。 . +、 ,