]> granicus.if.org Git - postgresql/commitdiff
Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly
authorTeodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
committerTeodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
Add Python script for building unaccent.rules from Unicode data. Don't
backpatch because unaccent changes may require tsvector/index
rebuild.

Thomas Munro <thomas.munro@enterprisedb.com>

contrib/unaccent/generate_unaccent_rules.py [new file with mode: 0644]
contrib/unaccent/unaccent.rules

diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
new file mode 100644 (file)
index 0000000..b838d8f
--- /dev/null
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+#
+# This script builds unaccent.rules on standard output when given the
+# contents of UnicodeData.txt[1] on standard input.  Optionally includes
+# ligature expansion, if --expand-ligatures is given on the command line.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+# There is also a small set of special cases for codepoints that we
+# traditionally support even though Unicode doesn't consider them to
+# be ligatures or letters with marks.
+#
+# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+
+import re
+import sys
+
+def print_record(codepoint, letter):
+    print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
+
+class Codepoint:
+    def __init__(self, id, general_category, combining_ids):
+        self.id = id
+        self.general_category = general_category
+        self.combining_ids = combining_ids
+
+def is_plain_letter(codepoint):
+    """Return true if codepoint represents a plain ASCII letter."""
+    return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
+           (codepoint.id >= ord('A') and codepoint.id <= ord('Z'))
+
+def is_mark(codepoint):
+    """Returns true for diacritical marks (combining codepoints)."""
+    return codepoint.general_category in ("Mn", "Me", "Mc")
+
+def is_letter_with_marks(codepoint, table):
+    """Returns true for plain letters combined with one or more marks."""
+    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    return len(codepoint.combining_ids) > 1 and \
+           is_plain_letter(table[codepoint.combining_ids[0]]) and \
+           all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
+
+def is_letter(codepoint, table):
+    """Return true for letter with or without diacritical marks."""
+    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the base codepoint without marks."""
+    if is_letter_with_marks(codepoint, table):
+        return table[codepoint.combining_ids[0]]
+    elif is_plain_letter(codepoint):
+        return codepoint
+    else:
+        raise "mu"
+
+def is_ligature(codepoint, table):
+    """Return true for letters combined with letters."""
+    return all(is_letter(table[i], table) for i in codepoint.combining_ids)
+
+def get_plain_letters(codepoint, table):
+    """Return a list of plain letters from a ligature."""
+    assert(is_ligature(codepoint, table))
+    return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]
+
+def main(expand_ligatures):
+    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    table = {}
+    all = []
+
+    # read everything we need into memory
+    for line in sys.stdin.readlines():
+        fields = line.split(";")
+        if len(fields) > 5:
+            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+            general_category = fields[2]
+            decomposition = fields[5]
+            decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
+            id = int(fields[0], 16)
+            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+            codepoint = Codepoint(id, general_category, combining_ids)
+            table[id] = codepoint
+            all.append(codepoint)
+
+    # walk through all the codepoints looking for interesting mappings
+    for codepoint in all:
+        if codepoint.general_category.startswith('L') and \
+           len(codepoint.combining_ids) > 1:
+            if is_letter_with_marks(codepoint, table):
+                print_record(codepoint.id,
+                             chr(get_plain_letter(codepoint, table).id))
+            elif expand_ligatures and is_ligature(codepoint, table):
+                print_record(codepoint.id,
+                             "".join(unichr(combining_codepoint.id)
+                                     for combining_codepoint \
+                                     in get_plain_letters(codepoint, table)))
+
+    # some special cases
+    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
+    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
+    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
+    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
+    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
+    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
+    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
+    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
+    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
+    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
+    print_record(0x0167, "t") # LATIN SMALL LETTER t WITH STROKE
+    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
+    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
+    if expand_ligatures:
+        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
+        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
+        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
+        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
+        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+
+if __name__ == "__main__":
+    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
index cc2f7a65858d7e265850cea4592528c3866ca698..73c24a188badf9dfcbf5ab3950841232014e3c22 100644 (file)
@@ -4,22 +4,59 @@
 Ã     A
 Ä     A
 Å     A
-Æ     A
+Ç     C
+È     E
+É     E
+Ê     E
+Ë     E
+Ì     I
+Í     I
+Î     I
+Ï     I
+Ñ     N
+Ò     O
+Ó     O
+Ô     O
+Õ     O
+Ö     O
+Ù     U
+Ú     U
+Û     U
+Ü     U
+Ý     Y
 à     a
 á     a
 â     a
 ã     a
 ä     a
 å     a
-æ     a
+ç     c
+è     e
+é     e
+ê     e
+ë     e
+ì     i
+í     i
+î     i
+ï     i
+ñ     n
+ò     o
+ó     o
+ô     o
+õ     o
+ö     o
+ù     u
+ú     u
+û     u
+ü     u
+ý     y
+ÿ     y
 Ā     A
 ā     a
 Ă     A
 ă     a
 Ą     A
 ą     a
-Ç     C
-ç     c
 Ć     C
 ć     c
 Ĉ     C
 č     c
 Ď     D
 ď     d
-Đ     D
-đ     d
-È     E
-É     E
-Ê     E
-Ë     E
-è     e
-é     e
-ê     e
-ë     e
 Ē     E
 ē     e
 Ĕ     E
 ģ     g
 Ĥ     H
 ĥ     h
-Ħ     H
-ħ     h
 Ĩ     I
-Ì     I
-Í     I
-Î     I
-Ï     I
-ì     i
-í     i
-î     i
-ï     i
 ĩ     i
 Ī     I
 ī     i
 Į     I
 į     i
 İ     I
-ı     i
-IJ     I
-ij     i
+IJ     IJ
+ij     ij
 Ĵ     J
 ĵ     j
 Ķ     K
 ķ     k
-ĸ     k
 Ĺ     L
 ĺ     l
 Ļ     L
 ļ     l
 Ľ     L
 ľ     l
-Ŀ     L
-ŀ     l
-Ł     L
-ł     l
-Ñ     N
-ñ     n
 Ń     N
 ń     n
 Ņ     N
 ņ     n
 Ň     N
 ň     n
-ʼn     n
-Ŋ     N
-ŋ     n
-Ò     O
-Ó     O
-Ô     O
-Õ     O
-Ö     O
-ò     o
-ó     o
-ô     o
-õ     o
-ö     o
 Ō     O
 ō     o
 Ŏ     O
 ŏ     o
 Ő     O
 ő     o
-Œ     E
-œ     e
-Ø     O
-ø     o
 Ŕ     R
 ŕ     r
 Ŗ     R
 ŗ     r
 Ř     R
 ř     r
-ß     S
 Ś     S
 ś     s
 Ŝ     S
 ţ     t
 Ť     T
 ť     t
-Ŧ     T
-ŧ     t
-Ù     U
-Ú     U
-Û     U
-Ü     U
-ù     u
-ú     u
-û     u
-ü     u
 Ũ     U
 ũ     u
 Ū     U
 ų     u
 Ŵ     W
 ŵ     w
-Ý     Y
-ý     y
-ÿ     y
 Ŷ     Y
 ŷ     y
 Ÿ     Y
 ż     z
 Ž     Z
 ž     z
-ё     е
+Ơ     O
+ơ     o
+Ư     U
+ư     u
+DŽ     DZ
+Dž     Dz
+dž     dz
+LJ     LJ
+Lj     Lj
+lj     lj
+NJ     NJ
+Nj     Nj
+nj     nj
+Ǎ     A
+ǎ     a
+Ǐ     I
+ǐ     i
+Ǒ     O
+ǒ     o
+Ǔ     U
+ǔ     u
+Ǧ     G
+ǧ     g
+Ǩ     K
+ǩ     k
+Ǫ     O
+ǫ     o
+ǰ     j
+DZ     DZ
+Dz     Dz
+dz     dz
+Ǵ     G
+ǵ     g
+Ǹ     N
+ǹ     n
+Ȁ     A
+ȁ     a
+Ȃ     A
+ȃ     a
+Ȅ     E
+ȅ     e
+Ȇ     E
+ȇ     e
+Ȉ     I
+ȉ     i
+Ȋ     I
+ȋ     i
+Ȍ     O
+ȍ     o
+Ȏ     O
+ȏ     o
+Ȑ     R
+ȑ     r
+Ȓ     R
+ȓ     r
+Ȕ     U
+ȕ     u
+Ȗ     U
+ȗ     u
+Ș     S
+ș     s
+Ț     T
+ț     t
+Ȟ     H
+ȟ     h
+Ȧ     A
+ȧ     a
+Ȩ     E
+ȩ     e
+Ȯ     O
+ȯ     o
+Ȳ     Y
+ȳ     y
+Ḁ    A
+ḁ    a
+Ḃ    B
+ḃ    b
+Ḅ    B
+ḅ    b
+Ḇ    B
+ḇ    b
+Ḋ    D
+ḋ    d
+Ḍ    D
+ḍ    d
+Ḏ    D
+ḏ    d
+Ḑ    D
+ḑ    d
+Ḓ    D
+ḓ    d
+Ḙ    E
+ḙ    e
+Ḛ    E
+ḛ    e
+Ḟ    F
+ḟ    f
+Ḡ    G
+ḡ    g
+Ḣ    H
+ḣ    h
+Ḥ    H
+ḥ    h
+Ḧ    H
+ḧ    h
+Ḩ    H
+ḩ    h
+Ḫ    H
+ḫ    h
+Ḭ    I
+ḭ    i
+Ḱ    K
+ḱ    k
+Ḳ    K
+ḳ    k
+Ḵ    K
+ḵ    k
+Ḷ    L
+ḷ    l
+Ḻ    L
+ḻ    l
+Ḽ    L
+ḽ    l
+Ḿ    M
+ḿ    m
+Ṁ    M
+ṁ    m
+Ṃ    M
+ṃ    m
+Ṅ    N
+ṅ    n
+Ṇ    N
+ṇ    n
+Ṉ    N
+ṉ    n
+Ṋ    N
+ṋ    n
+Ṕ    P
+ṕ    p
+Ṗ    P
+ṗ    p
+Ṙ    R
+ṙ    r
+Ṛ    R
+ṛ    r
+Ṟ    R
+ṟ    r
+Ṡ    S
+ṡ    s
+Ṣ    S
+ṣ    s
+Ṫ    T
+ṫ    t
+Ṭ    T
+ṭ    t
+Ṯ    T
+ṯ    t
+Ṱ    T
+ṱ    t
+Ṳ    U
+ṳ    u
+Ṵ    U
+ṵ    u
+Ṷ    U
+ṷ    u
+Ṽ    V
+ṽ    v
+Ṿ    V
+ṿ    v
+Ẁ    W
+ẁ    w
+Ẃ    W
+ẃ    w
+Ẅ    W
+ẅ    w
+Ẇ    W
+ẇ    w
+Ẉ    W
+ẉ    w
+Ẋ    X
+ẋ    x
+Ẍ    X
+ẍ    x
+Ẏ    Y
+ẏ    y
+Ẑ    Z
+ẑ    z
+Ẓ    Z
+ẓ    z
+Ẕ    Z
+ẕ    z
+ẖ    h
+ẗ    t
+ẘ    w
+ẙ    y
+Ạ    A
+ạ    a
+Ả    A
+ả    a
+Ẹ    E
+ẹ    e
+Ẻ    E
+ẻ    e
+Ẽ    E
+ẽ    e
+Ỉ    I
+ỉ    i
+Ị    I
+ị    i
+Ọ    O
+ọ    o
+Ỏ    O
+ỏ    o
+Ụ    U
+ụ    u
+Ủ    U
+ủ    u
+Ỳ    Y
+ỳ    y
+Ỵ    Y
+ỵ    y
+Ỷ    Y
+ỷ    y
+Ỹ    Y
+ỹ    y
+ff    ff
+fi    fi
+fl    fl
+ffi    ffi
+ffl    ffl
+st    st
+Ø     O
+ø     o
+Đ     D
+đ     d
+ı     i
+Ħ     H
+ħ     h
+Ł     L
+ł     l
+ʼn     'n
+Ŧ     T
+ŧ     t
 Ё     Е
+ё     е
+Æ     AE
+ß     ss
+æ     ae
+Œ     OE
+œ     oe