Improve script generating unaccent rules

author Teodor Sigaev <teodor@sigaev.ru>

Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)

committer Teodor Sigaev <teodor@sigaev.ru>

Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
author Teodor Sigaev <teodor@sigaev.ru>
Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
committer Teodor Sigaev <teodor@sigaev.ru>
Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

index b838d8f630d82b35178c651619946591f5ae6089..2f5520c81981597fcc70d7cc8d9b2dc013ab9310 100644 (file)
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -1,20 +1,33 @@
-#!/usr/bin/python
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
  #
  # This script builds unaccent.rules on standard output when given the
-# contents of UnicodeData.txt[1] on standard input.  Optionally includes
-# ligature expansion, if --expand-ligatures is given on the command line.
+# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as
+# arguments.  Ligature expansion and the Unicode CLDR Latin-ASCII
+# transliterator are enabled by default; both can be disabled with the
+# "--no-ligatures-expansion" command line option.
  #
  # The approach is to use the Unicode decomposition data to identify
  # precomposed codepoints that are equivalent to a ligature of several
  # letters, or a base letter with any number of diacritical marks.
-# There is also a small set of special cases for codepoints that we
-# traditionally support even though Unicode doesn't consider them to
-# be ligatures or letters with marks.
  #
-# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+# This approach handles most letters with diacritical marks and some
+# ligatures.  However, several characters (notably a majority of
+# ligatures) don't have a decomposition.  To handle these cases, we use
+# a standard Unicode transliterator available in the Common Locale Data
+# Repository (CLDR): Latin-ASCII.  This transliterator maps Unicode
+# characters to their ASCII-range equivalents.  Unless the
+# "--no-ligatures-expansion" option is given, the XML file of this
+# transliterator [2] -- given as a command line argument -- is parsed and used.
+#
+# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+
  
  import re
+import argparse
  import sys
+import xml.etree.ElementTree as ET
  
  def print_record(codepoint, letter):
      print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
      assert(is_ligature(codepoint, table))
      return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]
  
-def main(expand_ligatures):
+def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
+    """Parse the XML file and return a set of tuples (src, trg), where "src"
+    is the original character and "trg" the substitute."""
+    charactersSet = set()
+
+    # RegEx to parse rules
+    rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+
+    # construct tree from XML
+    transliterationTree = ET.parse(latinAsciiFilePath)
+    transliterationTreeRoot = transliterationTree.getroot()
+
+    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
+        matches = rulePattern.search(rule.text)
+
+        # The regular expression captures four groups corresponding
+        # to the characters.
+        #
+        # Group 1: plain "src" char. Empty if group 2 is not.
+        # Group 2: unicode-escaped "src" char (e.g. "\u0110"). Empty if group 1 is not.
+        #
+        # Group 3: "trg" char between quotes. Empty if group 4 is not.
+        # Group 4: plain "trg" char. Empty if group 3 is not.
+        if matches is not None:
+            src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
+            trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
+
+            # "'" and """ are escaped
+            trg = trg.replace("\\'", "'").replace('\\"', '"')
+
+            # the parser of unaccent only accepts non-whitespace characters
+            # for "src" and "trg" (see unaccent.c)
+            if not src.isspace() and not trg.isspace():
+                charactersSet.add((ord(src), trg))
+
+    return charactersSet
+
+def special_cases():
+    """Returns the special cases which are not handled by other methods"""
+    charactersSet = set()
+
+    # Cyrillic
+    charactersSet.add((0x0401, u"\u0415")) # CYRILLIC CAPITAL LETTER IO
+    charactersSet.add((0x0451, u"\u0435")) # CYRILLIC SMALL LETTER IO
+
+    # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
+    charactersSet.add((0x2103, u"\xb0C")) # DEGREE CELSIUS
+    charactersSet.add((0x2109, u"\xb0F")) # DEGREE FAHRENHEIT
+    charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
+
+    return charactersSet
+
+def main(args):
      # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
      decomposition_type_pattern = re.compile(" *<[^>]*> *")
  
      table = {}
      all = []
  
+    # unordered set to ensure uniqueness
+    charactersSet = set()
+
+    # read file UnicodeData.txt
+    unicodeDataFile = open(args.unicodeDataFilePath, 'r')
+
      # read everything we need into memory
-    for line in sys.stdin.readlines():
+    for line in unicodeDataFile:
          fields = line.split(";")
          if len(fields) > 5:
              # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
@@ -89,35 +160,34 @@ def main(expand_ligatures):
          if codepoint.general_category.startswith('L') and \
             len(codepoint.combining_ids) > 1:
              if is_letter_with_marks(codepoint, table):
-                print_record(codepoint.id,
-                             chr(get_plain_letter(codepoint, table).id))
-            elif expand_ligatures and is_ligature(codepoint, table):
-                print_record(codepoint.id,
+                charactersSet.add((codepoint.id,
+                             chr(get_plain_letter(codepoint, table).id)))
+            elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
+                charactersSet.add((codepoint.id,
                               "".join(unichr(combining_codepoint.id)
                                       for combining_codepoint \
-                                     in get_plain_letters(codepoint, table)))
-
-    # some special cases
-    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
-    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
-    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
-    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
-    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
-    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
-    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
-    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
-    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
-    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
-    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
-    print_record(0x0167, "t") # LATIN SMALL LETTER t WITH STROKE
-    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
-    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
-    if expand_ligatures:
-        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
-        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
-        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
-        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
-        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+                                     in get_plain_letters(codepoint, table))))
+
+    # add CLDR Latin-ASCII characters
+    if not args.noLigaturesExpansion:
+        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
+        charactersSet |= special_cases()
+
+    # sort for more convenient display
+    charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0])
+
+    for characterPair in charactersList:
+        print_record(characterPair[0], characterPair[1])
  
  if __name__ == "__main__":
-    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
+    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
+    args = parser.parse_args()
+
+    if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None:
+        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
+        sys.exit(1)
+
+    main(args)
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

index 73c24a188badf9dfcbf5ab3950841232014e3c22..84886da587aa8a0b4fd8e58fbc43a727f526b9b2 100644 (file)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -1,9 +1,18 @@
+©     (C)
+«     <<
+     -
+®     (R)
+»     >>
+¼      1/4
+½      1/2
+¾      3/4
  À     A
  Á     A
  Â     A
  Ã     A
  Ä     A
  Å     A
+Æ     AE
  Ç     C
  È     E
  É     E
@@ -13,23 +22,29 @@
  Í     I
  Î     I
  Ï     I
+Ð     D
  Ñ     N
  Ò     O
  Ó     O
  Ô     O
  Õ     O
  Ö     O
+×     *
+Ø     O
  Ù     U
  Ú     U
  Û     U
  Ü     U
  Ý     Y
+Þ     TH
+ß     ss
  à     a
  á     a
  â     a
  ã     a
  ä     a
  å     a
+æ     ae
  ç     c
  è     e
  é     e
@@ -39,17 +54,21 @@
  í     i
  î     i
  ï     i
+ð     d
  ñ     n
  ò     o
  ó     o
  ô     o
  õ     o
  ö     o
+÷     /
+ø     o
  ù     u
  ú     u
  û     u
  ü     u
  ý     y
+þ     th
  ÿ     y
  Ā     A
  ā     a
@@ -67,6 +86,8 @@
  č     c
  Ď     D
  ď     d
+Đ     D
+đ     d
  Ē     E
  ē     e
  Ĕ     E
@@ -87,6 +108,8 @@
  ģ     g
  Ĥ     H
  ĥ     h
+Ħ     H
+ħ     h
  Ĩ     I
  ĩ     i
  Ī     I
@@ -96,30 +119,41 @@
  Į     I
  į     i
  İ     I
+ı     i
  Ĳ     IJ
  ĳ     ij
  Ĵ     J
  ĵ     j
  Ķ     K
  ķ     k
+ĸ     q
  Ĺ     L
  ĺ     l
  Ļ     L
  ļ     l
  Ľ     L
  ľ     l
+Ŀ     L
+ŀ     l
+Ł     L
+ł     l
  Ń     N
  ń     n
  Ņ     N
  ņ     n
  Ň     N
  ň     n
+ŉ     'n
+Ŋ     N
+ŋ     n
  Ō     O
  ō     o
  Ŏ     O
  ŏ     o
  Ő     O
  ő     o
+Œ     OE
+œ     oe
  Ŕ     R
  ŕ     r
  Ŗ     R
@@ -138,6 +172,8 @@
  ţ     t
  Ť     T
  ť     t
+Ŧ     T
+ŧ     t
  Ũ     U
  ũ     u
  Ū     U
@@ -161,10 +197,46 @@
  ż     z
  Ž     Z
  ž     z
+ſ     s
+ƀ     b
+Ɓ     B
+Ƃ     B
+ƃ     b
+Ƈ     C
+ƈ     c
+Ɖ     D
+Ɗ     D
+Ƌ     D
+ƌ     d
+Ɛ     E
+Ƒ     F
+ƒ     f
+Ɠ     G
+ƕ     hv
+Ɩ     I
+Ɨ     I
+Ƙ     K
+ƙ     k
+ƚ     l
+Ɲ     N
+ƞ     n
  Ơ     O
  ơ     o
+Ƣ     OI
+ƣ     oi
+Ƥ     P
+ƥ     p
+ƫ     t
+Ƭ     T
+ƭ     t
+Ʈ     T
  Ư     U
  ư     u
+Ʋ     V
+Ƴ     Y
+ƴ     y
+Ƶ     Z
+ƶ     z
  Ǆ     DZ
  ǅ     Dz
  ǆ     dz
@@ -182,6 +254,8 @@
  ǒ     o
  Ǔ     U
  ǔ     u
+Ǥ     G
+ǥ     g
  Ǧ     G
  ǧ     g
  Ǩ     K
@@ -226,6 +300,9 @@
  ț     t
  Ȟ     H
  ȟ     h
+ȡ     d
+Ȥ     Z
+ȥ     z
  Ȧ     A
  ȧ     a
  Ȩ     E
@@ -234,6 +311,128 @@
  ȯ     o
  Ȳ     Y
  ȳ     y
+ȴ     l
+ȵ     n
+ȶ     t
+ȷ     j
+ȸ     db
+ȹ     qp
+Ⱥ     A
+Ȼ     C
+ȼ     c
+Ƚ     L
+Ⱦ     T
+ȿ     s
+ɀ     z
+Ƀ     B
+Ʉ     U
+Ɇ     E
+ɇ     e
+Ɉ     J
+ɉ     j
+Ɍ     R
+ɍ     r
+Ɏ     Y
+ɏ     y
+ɓ     b
+ɕ     c
+ɖ     d
+ɗ     d
+ɛ     e
+ɟ     j
+ɠ     g
+ɡ     g
+ɢ     G
+ɦ     h
+ɧ     h
+ɨ     i
+ɪ     I
+ɫ     l
+ɬ     l
+ɭ     l
+ɱ     m
+ɲ     n
+ɳ     n
+ɴ     N
+ɶ     OE
+ɼ     r
+ɽ     r
+ɾ     r
+ʀ     R
+ʂ     s
+ʈ     t
+ʉ     u
+ʋ     v
+ʏ     Y
+ʐ     z
+ʑ     z
+ʙ     B
+ʛ     G
+ʜ     H
+ʝ     j
+ʟ     L
+ʠ     q
+ʣ     dz
+ʥ     dz
+ʦ     ts
+ʪ     ls
+ʫ     lz
+Ё     Е
+ё     е
+ᴀ    A
+ᴁ    AE
+ᴃ    B
+ᴄ    C
+ᴅ    D
+ᴆ    D
+ᴇ    E
+ᴊ    J
+ᴋ    K
+ᴌ    L
+ᴍ    M
+ᴏ    O
+ᴘ    P
+ᴛ    T
+ᴜ    U
+ᴠ    V
+ᴡ    W
+ᴢ    Z
+ᵫ    ue
+ᵬ    b
+ᵭ    d
+ᵮ    f
+ᵯ    m
+ᵰ    n
+ᵱ    p
+ᵲ    r
+ᵳ    r
+ᵴ    s
+ᵵ    t
+ᵶ    z
+ᵺ    th
+ᵻ    I
+ᵽ    p
+ᵾ    U
+ᶀ    b
+ᶁ    d
+ᶂ    f
+ᶃ    g
+ᶄ    k
+ᶅ    l
+ᶆ    m
+ᶇ    n
+ᶈ    p
+ᶉ    r
+ᶊ    s
+ᶌ    v
+ᶍ    x
+ᶎ    z
+ᶏ    a
+ᶑ    d
+ᶒ    e
+ᶓ    e
+ᶖ    i
+ᶙ    u
  Ḁ    A
  ḁ    a
  Ḃ    B
@@ -356,6 +555,10 @@
  ẗ    t
  ẘ    w
  ẙ    y
+ẚ    a
+ẜ    s
+ẝ    s
+ẞ    SS
  Ạ    A
  ạ    a
  Ả    A
@@ -386,28 +589,461 @@
  ỷ    y
  Ỹ    Y
  ỹ    y
+Ỻ    LL
+ỻ    ll
+Ỽ    V
+ỽ    v
+Ỿ    Y
+ỿ    y
+‐    -
+‑    -
+‒    -
+–    -
+—    -
+―    -
+‖    ||
+‘    '
+’    '
+‚    ,
+‛    '
+“    "
+”    "
+„    ,,
+‟    "
+․    .
+‥    ..
+…    ...
+′    '
+″    "
+‹    <
+›    >
+‼    !!
+⁄    /
+⁅    [
+⁆    ]
+⁇    ??
+⁈    ?!
+⁉    !?
+⁎    *
+₠    CE
+₢    Cr
+₣    Fr.
+₤    L.
+₧    Pts
+₹    Rs
+₺    TL
+℀    a/c
+℁    a/s
+ℂ    C
+℃    °C
+℅    c/o
+℆    c/u
+℉    °F
+ℊ    g
+ℋ    H
+ℌ    x
+ℍ    H
+ℎ    h
+ℐ    I
+ℑ    I
+ℒ    L
+ℓ    l
+ℕ    N
+№    No
+℗    (P)
+ℙ    P
+ℚ    Q
+ℛ    R
+ℜ    R
+ℝ    R
+℞    Rx
+℡    TEL
+ℤ    Z
+ℨ    Z
+ℬ    B
+ℭ    C
+ℯ    e
+ℰ    E
+ℱ    F
+ℳ    M
+ℴ    o
+ℹ    i
+℻    FAX
+ⅅ    D
+ⅆ    d
+ⅇ    e
+ⅈ    i
+ⅉ    j
+⅓     1/3
+⅔     2/3
+⅕     1/5
+⅖     2/5
+⅗     3/5
+⅘     4/5
+⅙     1/6
+⅚     5/6
+⅛     1/8
+⅜     3/8
+⅝     5/8
+⅞     7/8
+⅟     1/
+Ⅰ    I
+Ⅱ    II
+Ⅲ    III
+Ⅳ    IV
+Ⅴ    V
+Ⅵ    VI
+Ⅶ    VII
+Ⅷ    VIII
+Ⅸ    IX
+Ⅹ    X
+Ⅺ    XI
+Ⅻ    XII
+Ⅼ    L
+Ⅽ    C
+Ⅾ    D
+Ⅿ    M
+ⅰ    i
+ⅱ    ii
+ⅲ    iii
+ⅳ    iv
+ⅴ    v
+ⅵ    vi
+ⅶ    vii
+ⅷ    viii
+ⅸ    ix
+ⅹ    x
+ⅺ    xi
+ⅻ    xii
+ⅼ    l
+ⅽ    c
+ⅾ    d
+ⅿ    m
+−    -
+∕    /
+∖    \
+∣    |
+∥    ||
+≪    <<
+≫    >>
+⑴    (1)
+⑵    (2)
+⑶    (3)
+⑷    (4)
+⑸    (5)
+⑹    (6)
+⑺    (7)
+⑻    (8)
+⑼    (9)
+⑽    (10)
+⑾    (11)
+⑿    (12)
+⒀    (13)
+⒁    (14)
+⒂    (15)
+⒃    (16)
+⒄    (17)
+⒅    (18)
+⒆    (19)
+⒇    (20)
+⒈    1.
+⒉    2.
+⒊    3.
+⒋    4.
+⒌    5.
+⒍    6.
+⒎    7.
+⒏    8.
+⒐    9.
+⒑    10.
+⒒    11.
+⒓    12.
+⒔    13.
+⒕    14.
+⒖    15.
+⒗    16.
+⒘    17.
+⒙    18.
+⒚    19.
+⒛    20.
+⒜    (a)
+⒝    (b)
+⒞    (c)
+⒟    (d)
+⒠    (e)
+⒡    (f)
+⒢    (g)
+⒣    (h)
+⒤    (i)
+⒥    (j)
+⒦    (k)
+⒧    (l)
+⒨    (m)
+⒩    (n)
+⒪    (o)
+⒫    (p)
+⒬    (q)
+⒭    (r)
+⒮    (s)
+⒯    (t)
+⒰    (u)
+⒱    (v)
+⒲    (w)
+⒳    (x)
+⒴    (y)
+⒵    (z)
+⦅    ((
+⦆    ))
+⩴    ::=
+⩵    ==
+⩶    ===
+、    ,
+。    .
+〇    0
+〈    <
+〉    >
+《    <<
+》    >>
+〔    [
+〕    ]
+〘    [
+〙    ]
+〚    [
+〛    ]
+〝    "
+〞    "
+㍱    hPa
+㍲    da
+㍳    AU
+㍴    bar
+㍵    oV
+㍶    pc
+㍷    dm
+㍺    IU
+㎀    pA
+㎁    nA
+㎃    mA
+㎄    kA
+㎅    KB
+㎆    MB
+㎇    GB
+㎈    cal
+㎉    kcal
+㎊    pF
+㎋    nF
+㎎    mg
+㎏    kg
+㎐    Hz
+㎑    kHz
+㎒    MHz
+㎓    GHz
+㎔    THz
+㎙    fm
+㎚    nm
+㎜    mm
+㎝    cm
+㎞    km
+㎧    m/s
+㎩    Pa
+㎪    kPa
+㎫    MPa
+㎬    GPa
+㎭    rad
+㎮    rad/s
+㎰    ps
+㎱    ns
+㎳    ms
+㎴    pV
+㎵    nV
+㎷    mV
+㎸    kV
+㎹    MV
+㎺    pW
+㎻    nW
+㎽    mW
+㎾    kW
+㎿    MW
+㏂    a.m.
+㏃    Bq
+㏄    cc
+㏅    cd
+㏆    C/kg
+㏇    Co.
+㏈    dB
+㏉    Gy
+㏊    ha
+㏋    HP
+㏌    in
+㏍    KK
+㏎    KM
+㏏    kt
+㏐    lm
+㏑    ln
+㏒    log
+㏓    lx
+㏔    mb
+㏕    mil
+㏖    mol
+㏗    pH
+㏘    p.m.
+㏙    PPM
+㏚    PR
+㏛    sr
+㏜    Sv
+㏝    Wb
+㏞    V/m
+㏟    A/m
  ﬀ    ff
  ﬁ    fi
  ﬂ    fl
  ﬃ    ffi
  ﬄ    ffl
+ﬅ    st
  ﬆ    st
-Ø     O
-ø     o
-Đ     D
-đ     d
-ı     i
-Ħ     H
-ħ     h
-Ł     L
-ł     l
-ŉ     'n
-Ŧ     T
-ŧ     t
-Ё     Е
-ё     е
-Æ     AE
-ß     ss
-æ     ae
-Œ     OE
-œ     oe
+︐    ,
+︑    ,
+︒    .
+︓    :
+︔    ;
+︕    !
+︖    ?
+︙    ...
+︰    ..
+︱    -
+︲    -
+︵    (
+︶    )
+︷    {
+︸    }
+︹    [
+︺    ]
+︽    <<
+︾    >>
+︿    <
+﹀    >
+﹇    [
+﹈    ]
+﹐    ,
+﹑    ,
+﹒    .
+﹔    ;
+﹕    :
+﹖    ?
+﹗    !
+﹘    -
+﹙    (
+﹚    )
+﹛    {
+﹜    }
+﹝    [
+﹞    ]
+﹟    #
+﹠    &
+﹡    *
+﹢    +
+﹣    -
+﹤    <
+﹥    >
+﹦    =
+﹨    \
+﹩    $
+﹪    %
+﹫    @
+！    !
+＂    "
+＃    #
+＄    $
+％    %
+＆    &
+＇    '
+（    (
+）    )
+＊    *
+＋    +
+，    ,
+－    -
+．    .
+／    /
+０    0
+１    1
+２    2
+３    3
+４    4
+５    5
+６    6
+７    7
+８    8
+９    9
+：    :
+；    ;
+＜    <
+＝    =
+＞    >
+？    ?
+＠    @
+Ａ    A
+Ｂ    B
+Ｃ    C
+Ｄ    D
+Ｅ    E
+Ｆ    F
+Ｇ    G
+Ｈ    H
+Ｉ    I
+Ｊ    J
+Ｋ    K
+Ｌ    L
+Ｍ    M
+Ｎ    N
+Ｏ    O
+Ｐ    P
+Ｑ    Q
+Ｒ    R
+Ｓ    S
+Ｔ    T
+Ｕ    U
+Ｖ    V
+Ｗ    W
+Ｘ    X
+Ｙ    Y
+Ｚ    Z
+［    [
+＼    \
+］    ]
+＾    ^
+＿    _
+｀    `
+ａ    a
+ｂ    b
+ｃ    c
+ｄ    d
+ｅ    e
+ｆ    f
+ｇ    g
+ｈ    h
+ｉ    i
+ｊ    j
+ｋ    k
+ｌ    l
+ｍ    m
+ｎ    n
+ｏ    o
+ｐ    p
+ｑ    q
+ｒ    r
+ｓ    s
+ｔ    t
+ｕ    u
+ｖ    v
+ｗ    w
+ｘ    x
+ｙ    y
+ｚ    z
+｛    {
+｜    |
+｝    }
+～    ~
+｟    ((
+｠    ))
+｡    .
+､    ,
author	Teodor Sigaev <teodor@sigaev.ru>
	Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Wed, 16 Mar 2016 13:47:03 +0000 (16:47 +0300)
contrib/unaccent/generate_unaccent_rules.py		patch \| blob \| history
contrib/unaccent/unaccent.rules		patch \| blob \| history