granicus.if.org Git - postgresql/commitdiff
Extend the default rules file for contrib/unaccent with Vietnamese letters.
author: Tom Lane <tgl@sss.pgh.pa.us>
Wed, 16 Aug 2017 20:51:56 +0000 (16:51 -0400)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Wed, 16 Aug 2017 20:51:56 +0000 (16:51 -0400)
Improve generate_unaccent_rules.py to handle composed characters whose base
is another composed character rather than a plain letter.  The net effect
of this is to add a bunch of multi-accented Vietnamese characters to
unaccent.rules.

Original complaint from Kha Nguyen, diagnosis of the script's shortcoming
by Thomas Munro.

Dang Minh Huong and Michael Paquier

Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com

contrib/unaccent/generate_unaccent_rules.py
contrib/unaccent/unaccent.rules

index a5eb42f0b186f13ae303a792f7a8e702dad12d26..4b1b011861f1daeedefaa61cddb87656598b312a 100644 (file)
@@ -48,24 +48,47 @@ def is_mark(codepoint):
     return codepoint.general_category in ("Mn", "Me", "Mc")
 
 def is_letter_with_marks(codepoint, table):
-    """Returns true for plain letters combined with one or more marks."""
+    """Returns true for letters combined with one or more marks."""
     # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
-    return len(codepoint.combining_ids) > 1 and \
-           is_plain_letter(table[codepoint.combining_ids[0]]) and \
-           all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
+
+    # Letter may have no combining characters, in which case it has
+    # no marks.
+    if len(codepoint.combining_ids) == 1:
+        return False
+
+    # A letter without diacritical marks has none of them.
+    if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
+        return False
+
+    # Check if the base letter of this letter has marks.
+    codepoint_base = codepoint.combining_ids[0]
+    if (is_plain_letter(table[codepoint_base]) is False and \
+        is_letter_with_marks(table[codepoint_base], table) is False):
+        return False
+
+    return True
 
 def is_letter(codepoint, table):
     """Return true for letter with or without diacritical marks."""
     return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
 
 def get_plain_letter(codepoint, table):
-    """Return the base codepoint without marks."""
+    """Return the base codepoint without marks. If this codepoint has more
+    than one combining character, do a recursive lookup on the table to
+    find out its plain base letter."""
     if is_letter_with_marks(codepoint, table):
-        return table[codepoint.combining_ids[0]]
+        if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
+            return get_plain_letter(table[codepoint.combining_ids[0]], table)
+        elif is_plain_letter(table[codepoint.combining_ids[0]]):
+            return table[codepoint.combining_ids[0]]
+
+        # Should not come here
+        assert(False)
     elif is_plain_letter(codepoint):
         return codepoint
-    else:
-        raise "mu"
+
+    # Should not come here
+    assert(False)
 
 def is_ligature(codepoint, table):
     """Return true for letters combined with letters."""
index 84886da587aa8a0b4fd8e58fbc43a727f526b9b2..97f9ed47cfa8585b470b1b586bc7736545456f4a 100644 (file)
 ǒ     o
 Ǔ     U
 ǔ     u
+Ǖ     U
+ǖ     u
+Ǘ     U
+ǘ     u
+Ǚ     U
+ǚ     u
+Ǜ     U
+ǜ     u
+Ǟ     A
+ǟ     a
+Ǡ     A
+ǡ     a
 Ǥ     G
 ǥ     g
 Ǧ     G
 ǩ     k
 Ǫ     O
 ǫ     o
+Ǭ     O
+ǭ     o
 ǰ     j
 DZ     DZ
 Dz     Dz
 ǵ     g
 Ǹ     N
 ǹ     n
+Ǻ     A
+ǻ     a
 Ȁ     A
 ȁ     a
 Ȃ     A
 ȧ     a
 Ȩ     E
 ȩ     e
+Ȫ     O
+ȫ     o
+Ȭ     O
+ȭ     o
 Ȯ     O
 ȯ     o
+Ȱ     O
+ȱ     o
 Ȳ     Y
 ȳ     y
 ȴ     l
 ḅ    b
 Ḇ    B
 ḇ    b
+Ḉ    C
+ḉ    c
 Ḋ    D
 ḋ    d
 Ḍ    D
 ḑ    d
 Ḓ    D
 ḓ    d
+Ḕ    E
+ḕ    e
+Ḗ    E
+ḗ    e
 Ḙ    E
 ḙ    e
 Ḛ    E
 ḛ    e
+Ḝ    E
+ḝ    e
 Ḟ    F
 ḟ    f
 Ḡ    G
 ḫ    h
 Ḭ    I
 ḭ    i
+Ḯ    I
+ḯ    i
 Ḱ    K
 ḱ    k
 Ḳ    K
 ḵ    k
 Ḷ    L
 ḷ    l
+Ḹ    L
+ḹ    l
 Ḻ    L
 ḻ    l
 Ḽ    L
 ṉ    n
 Ṋ    N
 ṋ    n
+Ṍ    O
+ṍ    o
+Ṏ    O
+ṏ    o
+Ṑ    O
+ṑ    o
+Ṓ    O
+ṓ    o
 Ṕ    P
 ṕ    p
 Ṗ    P
 ṙ    r
 Ṛ    R
 ṛ    r
+Ṝ    R
+ṝ    r
 Ṟ    R
 ṟ    r
 Ṡ    S
 ṡ    s
 Ṣ    S
 ṣ    s
+Ṥ    S
+ṥ    s
+Ṧ    S
+ṧ    s
+Ṩ    S
+ṩ    s
 Ṫ    T
 ṫ    t
 Ṭ    T
 ṵ    u
 Ṷ    U
 ṷ    u
+Ṹ    U
+ṹ    u
+Ṻ    U
+ṻ    u
 Ṽ    V
 ṽ    v
 Ṿ    V
 ạ    a
 Ả    A
 ả    a
+Ấ    A
+ấ    a
+Ầ    A
+ầ    a
+Ẩ    A
+ẩ    a
+Ẫ    A
+ẫ    a
+Ậ    A
+ậ    a
+Ắ    A
+ắ    a
+Ằ    A
+ằ    a
+Ẳ    A
+ẳ    a
+Ẵ    A
+ẵ    a
+Ặ    A
+ặ    a
 Ẹ    E
 ẹ    e
 Ẻ    E
 ẻ    e
 Ẽ    E
 ẽ    e
+Ế    E
+ế    e
+Ề    E
+ề    e
+Ể    E
+ể    e
+Ễ    E
+ễ    e
+Ệ    E
+ệ    e
 Ỉ    I
 ỉ    i
 Ị    I
 ọ    o
 Ỏ    O
 ỏ    o
+Ố    O
+ố    o
+Ồ    O
+ồ    o
+Ổ    O
+ổ    o
+Ỗ    O
+ỗ    o
+Ộ    O
+ộ    o
+Ớ    O
+ớ    o
+Ờ    O
+ờ    o
+Ở    O
+ở    o
+Ỡ    O
+ỡ    o
+Ợ    O
+ợ    o
 Ụ    U
 ụ    u
 Ủ    U
 ủ    u
+Ứ    U
+ứ    u
+Ừ    U
+ừ    u
+Ử    U
+ử    u
+Ữ    U
+ữ    u
+Ự    U
+ự    u
 Ỳ    Y
 ỳ    y
 Ỵ    Y