Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly

author Teodor Sigaev <teodor@sigaev.ru>

Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)

committer Teodor Sigaev <teodor@sigaev.ru>

Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
author Teodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
committer Teodor Sigaev <teodor@sigaev.ru>
Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

new file mode 100644 (file)

index 0000000..b838d8f
--- /dev/null
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+#
+# This script builds unaccent.rules on standard output when given the
+# contents of UnicodeData.txt[1] on standard input.  Optionally includes
+# ligature expansion, if --expand-ligatures is given on the command line.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+# There is also a small set of special cases for codepoints that we
+# traditionally support even though Unicode doesn't consider them to
+# be ligatures or letters with marks.
+#
+# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+
+import re
+import sys
+
+def print_record(codepoint, letter):
+    """Print one rule on stdout: the character, a tab, its replacement (UTF-8)."""
+    rule = unichr(codepoint) + "\t" + letter
+    print rule.encode("UTF-8")
+
+class Codepoint:
+    """One UnicodeData.txt entry: its number, category and decomposition."""
+    def __init__(self, id, general_category, combining_ids):
+        # id: the code point as an integer
+        # general_category: general category string from the data file, e.g. "Mn"
+        # combining_ids: integer code points listed in the decomposition mapping
+        self.combining_ids = combining_ids
+        self.general_category = general_category
+        self.id = id
+
+def is_plain_letter(codepoint):
+    """Return true if codepoint is an unaccented ASCII letter (a-z or A-Z)."""
+    number = codepoint.id
+    return ord('a') <= number <= ord('z') or \
+           ord('A') <= number <= ord('Z')
+
+def is_mark(codepoint):
+    """Return true if codepoint's general category is Mn, Me or Mc (a mark)."""
+    mark_categories = ("Mn", "Me", "Mc")
+    return codepoint.general_category in mark_categories
+
+def is_letter_with_marks(codepoint, table):
+    """Return true if codepoint decomposes to a plain letter plus only marks."""
+    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    components = codepoint.combining_ids
+    # a single-element (or empty) decomposition has no marks to strip
+    if len(components) <= 1:
+        return False
+    if not is_plain_letter(table[components[0]]):
+        return False
+    return all(is_mark(table[i]) for i in components[1:])
+
+def is_letter(codepoint, table):
+    """Return true for a plain letter or a plain letter carrying marks."""
+    if is_plain_letter(codepoint):
+        return True
+    return is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the base codepoint without marks.
+
+    For a letter with marks this is the table entry of the first element of
+    its decomposition; a plain letter is returned unchanged.  Raises
+    ValueError for anything else.
+    """
+    if is_letter_with_marks(codepoint, table):
+        return table[codepoint.combining_ids[0]]
+    elif is_plain_letter(codepoint):
+        return codepoint
+    else:
+        # String exceptions are rejected by Python 2.6 and later (the raise
+        # statement itself fails with TypeError), so raise a real exception.
+        raise ValueError("codepoint U+%04X is not a letter" % codepoint.id)
+
+def is_ligature(codepoint, table):
+    """Return true if every element of the decomposition is a letter."""
+    for component_id in codepoint.combining_ids:
+        if not is_letter(table[component_id], table):
+            return False
+    return True
+
+def get_plain_letters(codepoint, table):
+    """Return the base letters of a ligature, in decomposition order."""
+    assert(is_ligature(codepoint, table))
+    letters = []
+    for component_id in codepoint.combining_ids:
+        letters.append(get_plain_letter(table[component_id], table))
+    return letters
+
+def main(expand_ligatures):
+    """Read UnicodeData.txt on stdin and print unaccent rules on stdout.
+
+    Rules that expand ligatures are printed only if expand_ligatures is true.
+    """
+    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    table = {}
+    codepoints = []
+
+    # read everything we need into memory
+    for line in sys.stdin.readlines():
+        fields = line.split(";")
+        if len(fields) <= 5:
+            continue
+        # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+        general_category = fields[2]
+        # replace any "<type>" tag with a space so only hex code points remain
+        decomposition = decomposition_type_pattern.sub(' ', fields[5])
+        number = int(fields[0], 16)
+        combining_ids = [int(part, 16)
+                         for part in decomposition.split(" ") if part]
+        entry = Codepoint(number, general_category, combining_ids)
+        table[number] = entry
+        codepoints.append(entry)
+
+    # walk through all the codepoints looking for interesting mappings
+    for entry in codepoints:
+        if not entry.general_category.startswith('L'):
+            continue
+        if len(entry.combining_ids) <= 1:
+            continue
+        if is_letter_with_marks(entry, table):
+            base = get_plain_letter(entry, table)
+            print_record(entry.id, chr(base.id))
+        elif expand_ligatures and is_ligature(entry, table):
+            letters = get_plain_letters(entry, table)
+            print_record(entry.id,
+                         "".join(unichr(letter.id) for letter in letters))
+
+    # some special cases
+    special_cases = [
+        (0x00d8, "O"),  # LATIN CAPITAL LETTER O WITH STROKE
+        (0x00f8, "o"),  # LATIN SMALL LETTER O WITH STROKE
+        (0x0110, "D"),  # LATIN CAPITAL LETTER D WITH STROKE
+        (0x0111, "d"),  # LATIN SMALL LETTER D WITH STROKE
+        (0x0131, "i"),  # LATIN SMALL LETTER DOTLESS I
+        (0x0126, "H"),  # LATIN CAPITAL LETTER H WITH STROKE
+        (0x0127, "h"),  # LATIN SMALL LETTER H WITH STROKE
+        (0x0141, "L"),  # LATIN CAPITAL LETTER L WITH STROKE
+        (0x0142, "l"),  # LATIN SMALL LETTER L WITH STROKE
+        (0x0149, "'n"),  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+        (0x0166, "T"),  # LATIN CAPITAL LETTER T WITH STROKE
+        (0x0167, "t"),  # LATIN SMALL LETTER T WITH STROKE
+        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
+        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO
+    ]
+    ligature_special_cases = [
+        (0x00c6, "AE"),  # LATIN CAPITAL LETTER AE
+        (0x00df, "ss"),  # LATIN SMALL LETTER SHARP S
+        (0x00e6, "ae"),  # LATIN SMALL LETTER AE
+        (0x0152, "OE"),  # LATIN CAPITAL LIGATURE OE
+        (0x0153, "oe"),  # LATIN SMALL LIGATURE OE
+    ]
+    for number, replacement in special_cases:
+        print_record(number, replacement)
+    if expand_ligatures:
+        for number, replacement in ligature_special_cases:
+            print_record(number, replacement)
+
+if __name__ == "__main__":
+    # ligatures are expanded only when the sole argument is --expand-ligatures
+    expand_ligatures = sys.argv[1:] == ["--expand-ligatures"]
+    main(expand_ligatures)
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

index cc2f7a65858d7e265850cea4592528c3866ca698..73c24a188badf9dfcbf5ab3950841232014e3c22 100644 (file)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -4,22 +4,59 @@
  Ã     A
  Ä     A
  Å     A
-Æ     A
+Ç     C
+È     E
+É     E
+Ê     E
+Ë     E
+Ì     I
+Í     I
+Î     I
+Ï     I
+Ñ     N
+Ò     O
+Ó     O
+Ô     O
+Õ     O
+Ö     O
+Ù     U
+Ú     U
+Û     U
+Ü     U
+Ý     Y
  à     a
  á     a
  â     a
  ã     a
  ä     a
  å     a
-æ     a
+ç     c
+è     e
+é     e
+ê     e
+ë     e
+ì     i
+í     i
+î     i
+ï     i
+ñ     n
+ò     o
+ó     o
+ô     o
+õ     o
+ö     o
+ù     u
+ú     u
+û     u
+ü     u
+ý     y
+ÿ     y
  Ā     A
  ā     a
  Ă     A
  ă     a
  Ą     A
  ą     a
-Ç     C
-ç     c
  Ć     C
  ć     c
  Ĉ     C
@@ -30,16 +67,6 @@
  č     c
  Ď     D
  ď     d
-Đ     D
-đ     d
-È     E
-É     E
-Ê     E
-Ë     E
-è     e
-é     e
-ê     e
-ë     e
  Ē     E
  ē     e
  Ĕ     E
@@ -60,17 +87,7 @@
  ģ     g
  Ĥ     H
  ĥ     h
-Ħ     H
-ħ     h
  Ĩ     I
-Ì     I
-Í     I
-Î     I
-Ï     I
-ì     i
-í     i
-î     i
-ï     i
  ĩ     i
  Ī     I
  ī     i
@@ -79,62 +96,36 @@
  Į     I
  į     i
  İ     I
-ı     i
-Ĳ     I
-ĳ     i
+Ĳ     IJ
+ĳ     ij
  Ĵ     J
  ĵ     j
  Ķ     K
  ķ     k
-ĸ     k
  Ĺ     L
  ĺ     l
  Ļ     L
  ļ     l
  Ľ     L
  ľ     l
-Ŀ     L
-ŀ     l
-Ł     L
-ł     l
-Ñ     N
-ñ     n
  Ń     N
  ń     n
  Ņ     N
  ņ     n
  Ň     N
  ň     n
-ŉ     n
-Ŋ     N
-ŋ     n
-Ò     O
-Ó     O
-Ô     O
-Õ     O
-Ö     O
-ò     o
-ó     o
-ô     o
-õ     o
-ö     o
  Ō     O
  ō     o
  Ŏ     O
  ŏ     o
  Ő     O
  ő     o
-Œ     E
-œ     e
-Ø     O
-ø     o
  Ŕ     R
  ŕ     r
  Ŗ     R
  ŗ     r
  Ř     R
  ř     r
-ß     S
  Ś     S
  ś     s
  Ŝ     S
@@ -147,16 +138,6 @@
  ţ     t
  Ť     T
  ť     t
-Ŧ     T
-ŧ     t
-Ù     U
-Ú     U
-Û     U
-Ü     U
-ù     u
-ú     u
-û     u
-ü     u
  Ũ     U
  ũ     u
  Ū     U
@@ -171,9 +152,6 @@
  ų     u
  Ŵ     W
  ŵ     w
-Ý     Y
-ý     y
-ÿ     y
  Ŷ     Y
  ŷ     y
  Ÿ     Y
@@ -183,5 +161,253 @@
  ż     z
  Ž     Z
  ž     z
-ё     е
+Ơ     O
+ơ     o
+Ư     U
+ư     u
+Ǆ     DZ
+ǅ     Dz
+ǆ     dz
+Ǉ     LJ
+ǈ     Lj
+ǉ     lj
+Ǌ     NJ
+ǋ     Nj
+ǌ     nj
+Ǎ     A
+ǎ     a
+Ǐ     I
+ǐ     i
+Ǒ     O
+ǒ     o
+Ǔ     U
+ǔ     u
+Ǧ     G
+ǧ     g
+Ǩ     K
+ǩ     k
+Ǫ     O
+ǫ     o
+ǰ     j
+Ǳ     DZ
+ǲ     Dz
+ǳ     dz
+Ǵ     G
+ǵ     g
+Ǹ     N
+ǹ     n
+Ȁ     A
+ȁ     a
+Ȃ     A
+ȃ     a
+Ȅ     E
+ȅ     e
+Ȇ     E
+ȇ     e
+Ȉ     I
+ȉ     i
+Ȋ     I
+ȋ     i
+Ȍ     O
+ȍ     o
+Ȏ     O
+ȏ     o
+Ȑ     R
+ȑ     r
+Ȓ     R
+ȓ     r
+Ȕ     U
+ȕ     u
+Ȗ     U
+ȗ     u
+Ș     S
+ș     s
+Ț     T
+ț     t
+Ȟ     H
+ȟ     h
+Ȧ     A
+ȧ     a
+Ȩ     E
+ȩ     e
+Ȯ     O
+ȯ     o
+Ȳ     Y
+ȳ     y
+Ḁ    A
+ḁ    a
+Ḃ    B
+ḃ    b
+Ḅ    B
+ḅ    b
+Ḇ    B
+ḇ    b
+Ḋ    D
+ḋ    d
+Ḍ    D
+ḍ    d
+Ḏ    D
+ḏ    d
+Ḑ    D
+ḑ    d
+Ḓ    D
+ḓ    d
+Ḙ    E
+ḙ    e
+Ḛ    E
+ḛ    e
+Ḟ    F
+ḟ    f
+Ḡ    G
+ḡ    g
+Ḣ    H
+ḣ    h
+Ḥ    H
+ḥ    h
+Ḧ    H
+ḧ    h
+Ḩ    H
+ḩ    h
+Ḫ    H
+ḫ    h
+Ḭ    I
+ḭ    i
+Ḱ    K
+ḱ    k
+Ḳ    K
+ḳ    k
+Ḵ    K
+ḵ    k
+Ḷ    L
+ḷ    l
+Ḻ    L
+ḻ    l
+Ḽ    L
+ḽ    l
+Ḿ    M
+ḿ    m
+Ṁ    M
+ṁ    m
+Ṃ    M
+ṃ    m
+Ṅ    N
+ṅ    n
+Ṇ    N
+ṇ    n
+Ṉ    N
+ṉ    n
+Ṋ    N
+ṋ    n
+Ṕ    P
+ṕ    p
+Ṗ    P
+ṗ    p
+Ṙ    R
+ṙ    r
+Ṛ    R
+ṛ    r
+Ṟ    R
+ṟ    r
+Ṡ    S
+ṡ    s
+Ṣ    S
+ṣ    s
+Ṫ    T
+ṫ    t
+Ṭ    T
+ṭ    t
+Ṯ    T
+ṯ    t
+Ṱ    T
+ṱ    t
+Ṳ    U
+ṳ    u
+Ṵ    U
+ṵ    u
+Ṷ    U
+ṷ    u
+Ṽ    V
+ṽ    v
+Ṿ    V
+ṿ    v
+Ẁ    W
+ẁ    w
+Ẃ    W
+ẃ    w
+Ẅ    W
+ẅ    w
+Ẇ    W
+ẇ    w
+Ẉ    W
+ẉ    w
+Ẋ    X
+ẋ    x
+Ẍ    X
+ẍ    x
+Ẏ    Y
+ẏ    y
+Ẑ    Z
+ẑ    z
+Ẓ    Z
+ẓ    z
+Ẕ    Z
+ẕ    z
+ẖ    h
+ẗ    t
+ẘ    w
+ẙ    y
+Ạ    A
+ạ    a
+Ả    A
+ả    a
+Ẹ    E
+ẹ    e
+Ẻ    E
+ẻ    e
+Ẽ    E
+ẽ    e
+Ỉ    I
+ỉ    i
+Ị    I
+ị    i
+Ọ    O
+ọ    o
+Ỏ    O
+ỏ    o
+Ụ    U
+ụ    u
+Ủ    U
+ủ    u
+Ỳ    Y
+ỳ    y
+Ỵ    Y
+ỵ    y
+Ỷ    Y
+ỷ    y
+Ỹ    Y
+ỹ    y
+ﬀ    ff
+ﬁ    fi
+ﬂ    fl
+ﬃ    ffi
+ﬄ    ffl
+ﬆ    st
+Ø     O
+ø     o
+Đ     D
+đ     d
+ı     i
+Ħ     H
+ħ     h
+Ł     L
+ł     l
+ŉ     'n
+Ŧ     T
+ŧ     t
  Ё     Е
+ё     е
+Æ     AE
+ß     ss
+æ     ae
+Œ     OE
+œ     oe
author	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 4 Sep 2015 09:51:53 +0000 (12:51 +0300)
contrib/unaccent/generate_unaccent_rules.py	[new file with mode: 0644]	patch \| blob
contrib/unaccent/unaccent.rules		patch \| blob \| history