From ec0a69e49bf41a37b5c2d6f6be66d8abae00ee05 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 16 Aug 2017 16:51:56 -0400 Subject: [PATCH] Extend the default rules file for contrib/unaccent with Vietnamese letters. Improve generate_unaccent_rules.py to handle composed characters whose base is another composed character rather than a plain letter. The net effect of this is to add a bunch of multi-accented Vietnamese characters to unaccent.rules. Original complaint from Kha Nguyen, diagnosis of the script's shortcoming by Thomas Munro. Dang Minh Huong and Michael Paquier Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com --- contrib/unaccent/generate_unaccent_rules.py | 39 +++++-- contrib/unaccent/unaccent.rules | 114 ++++++++++++++++++++ 2 files changed, 145 insertions(+), 8 deletions(-) diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index a5eb42f0b1..4b1b011861 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -48,24 +48,47 @@ def is_mark(codepoint): return codepoint.general_category in ("Mn", "Me", "Mc") def is_letter_with_marks(codepoint, table): - """Returns true for plain letters combined with one or more marks.""" + """Returns true for letters combined with one or more marks.""" # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values - return len(codepoint.combining_ids) > 1 and \ - is_plain_letter(table[codepoint.combining_ids[0]]) and \ - all(is_mark(table[i]) for i in codepoint.combining_ids[1:]) + + # Letter may have no combining characters, in which case it has + # no marks. + if len(codepoint.combining_ids) == 1: + return False + + # A letter without diacritical marks has none of them. + if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False: + return False + + # Check if the base letter of this letter has marks. + codepoint_base = codepoint.combining_ids[0] + if (is_plain_letter(table[codepoint_base]) is False and \ + is_letter_with_marks(table[codepoint_base], table) is False): + return False + + return True def is_letter(codepoint, table): """Return true for letter with or without diacritical marks.""" return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table) def get_plain_letter(codepoint, table): - """Return the base codepoint without marks.""" + """Return the base codepoint without marks. If this codepoint has more + than one combining character, do a recursive lookup on the table to + find out its plain base letter.""" if is_letter_with_marks(codepoint, table): - return table[codepoint.combining_ids[0]] + if len(table[codepoint.combining_ids[0]].combining_ids) > 1: + return get_plain_letter(table[codepoint.combining_ids[0]], table) + elif is_plain_letter(table[codepoint.combining_ids[0]]): + return table[codepoint.combining_ids[0]] + + # Should not come here + assert(False) elif is_plain_letter(codepoint): return codepoint - else: - raise "mu" + + # Should not come here + assert(False) def is_ligature(codepoint, table): """Return true for letters combined with letters.""" diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 84886da587..97f9ed47cf 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -254,6 +254,18 @@ ǒ o Ǔ U ǔ u +Ǖ U +ǖ u +Ǘ U +ǘ u +Ǚ U +ǚ u +Ǜ U +ǜ u +Ǟ A +ǟ a +Ǡ A +ǡ a Ǥ G ǥ g Ǧ G @@ -262,6 +274,8 @@ ǩ k Ǫ O ǫ o +Ǭ O +ǭ o ǰ j DZ DZ Dz Dz @@ -270,6 +284,8 @@ ǵ g Ǹ N ǹ n +Ǻ A +ǻ a Ȁ A ȁ a Ȃ A @@ -307,8 +323,14 @@ ȧ a Ȩ E ȩ e +Ȫ O +ȫ o +Ȭ O +ȭ o Ȯ O ȯ o +Ȱ O +ȱ o Ȳ Y ȳ y ȴ l @@ -441,6 +463,8 @@ ḅ b Ḇ B ḇ b +Ḉ C +ḉ c Ḋ D ḋ d Ḍ D @@ -451,10 +475,16 @@ ḑ d Ḓ D ḓ d +Ḕ E +ḕ e +Ḗ E +ḗ e Ḙ E ḙ e Ḛ E ḛ e +Ḝ E +ḝ e Ḟ F ḟ f Ḡ G @@ -471,6 +501,8 @@ ḫ h Ḭ I ḭ i +Ḯ I +ḯ i Ḱ K ḱ k Ḳ K @@ -479,6 +511,8 @@ ḵ k Ḷ L ḷ l +Ḹ L +ḹ l Ḻ L ḻ l Ḽ L @@ -497,6 +531,14 @@ ṉ n Ṋ N ṋ n +Ṍ O +ṍ o +Ṏ O +ṏ o +Ṑ O +ṑ o +Ṓ O +ṓ o Ṕ P ṕ p Ṗ P @@ -505,12 +547,20 @@ ṙ r Ṛ R ṛ r +Ṝ R +ṝ r Ṟ R ṟ r Ṡ S ṡ s Ṣ S ṣ s +Ṥ S +ṥ s +Ṧ S +ṧ s +Ṩ S +ṩ s Ṫ T ṫ t Ṭ T @@ -525,6 +575,10 @@ ṵ u Ṷ U ṷ u +Ṹ U +ṹ u +Ṻ U +ṻ u Ṽ V ṽ v Ṿ V @@ -563,12 +617,42 @@ ạ a Ả A ả a +Ấ A +ấ a +Ầ A +ầ a +Ẩ A +ẩ a +Ẫ A +ẫ a +Ậ A +ậ a +Ắ A +ắ a +Ằ A +ằ a +Ẳ A +ẳ a +Ẵ A +ẵ a +Ặ A +ặ a Ẹ E ẹ e Ẻ E ẻ e Ẽ E ẽ e +Ế E +ế e +Ề E +ề e +Ể E +ể e +Ễ E +ễ e +Ệ E +ệ e Ỉ I ỉ i Ị I @@ -577,10 +661,40 @@ ọ o Ỏ O ỏ o +Ố O +ố o +Ồ O +ồ o +Ổ O +ổ o +Ỗ O +ỗ o +Ộ O +ộ o +Ớ O +ớ o +Ờ O +ờ o +Ở O +ở o +Ỡ O +ỡ o +Ợ O +ợ o Ụ U ụ u Ủ U ủ u +Ứ U +ứ u +Ừ U +ừ u +Ử U +ử u +Ữ U +ữ u +Ự U +ự u Ỳ Y ỳ y Ỵ Y -- 2.40.0