Update unaccent rules with release 34 of CLDR for Latin-ASCII.xml

author Michael Paquier <michael@paquier.xyz>

Thu, 10 Jan 2019 05:10:21 +0000 (14:10 +0900)

committer Michael Paquier <michael@paquier.xyz>

Thu, 10 Jan 2019 05:10:21 +0000 (14:10 +0900)
author Michael Paquier <michael@paquier.xyz>
Thu, 10 Jan 2019 05:10:21 +0000 (14:10 +0900)
committer Michael Paquier <michael@paquier.xyz>
Thu, 10 Jan 2019 05:10:21 +0000 (14:10 +0900)
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out

index 0835e141afbaf5b8e2d747071d4e5cc26df3f707..69c2cf9bd7ab0af4656c20de86c9f00e09f6c478 100644 (file)
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@@ -25,6 +25,12 @@ SELECT unaccent('ЁЖИК');
   ЕЖИК
  (1 row)
  
+SELECT unaccent('˃˖˗˜');
+ unaccent 
+----------
+ >+-~
+(1 row)
+
  SELECT unaccent('unaccent', 'foobar');
   unaccent 
  ----------
@@ -43,6 +49,12 @@ SELECT unaccent('unaccent', 'ЁЖИК');
   ЕЖИК
  (1 row)
  
+SELECT unaccent('unaccent', '˃˖˗˜');
+ unaccent 
+----------
+ >+-~
+(1 row)
+
  SELECT ts_lexize('unaccent', 'foobar');
   ts_lexize 
  -----------
@@ -61,3 +73,9 @@ SELECT ts_lexize('unaccent', 'ЁЖИК');
   {ЕЖИК}
  (1 row)
  
+SELECT ts_lexize('unaccent', '˃˖˗˜');
+ ts_lexize 
+-----------
+ {>+-~}
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

index c9aef490aef1feeb15b75786ae3db62ded3068b0..4419a771edf919d20a317a9383c5f6ff3fa9b0ff 100644 (file)
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -20,8 +20,13 @@
  # option is enabled, the XML file of this transliterator [2] -- given as a
  # command line argument -- will be parsed and used.
  #
+# Ideally you should use the latest release for each data set.  For
+# Latin-ASCII.xml, the latest data sets released can be browsed directly
+# via [3].  Note that this script is compatible with at least release 29.
+#
  # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
-# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
+# [3] https://unicode.org/cldr/trac/browser/tags
  
  # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
  # The approach is to be Python3 compatible with Python2 "backports".
@@ -140,8 +145,18 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
      transliterationTree = ET.parse(latinAsciiFilePath)
      transliterationTreeRoot = transliterationTree.getroot()
  
-    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
-        matches = rulePattern.search(rule.text)
+    # Fetch all the transliteration rules.  Since release 29 of Latin-ASCII.xml,
+    # all the transliteration rules are located in a single tRule block, with
+    # one rule per line.
+    blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule")
+    assert(len(blockRules) == 1)
+
+    # Split the block of rules into one element per line.
+    rules = blockRules[0].text.splitlines()
+
+    # And finish the processing of each individual rule.
+    for rule in rules:
+        matches = rulePattern.search(rule)
  
          # The regular expression capture four groups corresponding
          # to the characters.
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql

index ba72ab6261c09a9a446da468756942d06159e7cd..c671827caa55a634bfa9aa5752108433636b1a7e 100644 (file)
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -8,11 +8,14 @@ SET client_encoding TO 'UTF8';
  SELECT unaccent('foobar');
  SELECT unaccent('ёлка');
  SELECT unaccent('ЁЖИК');
+SELECT unaccent('˃˖˗˜');
  
  SELECT unaccent('unaccent', 'foobar');
  SELECT unaccent('unaccent', 'ёлка');
  SELECT unaccent('unaccent', 'ЁЖИК');
+SELECT unaccent('unaccent', '˃˖˗˜');
  
  SELECT ts_lexize('unaccent', 'foobar');
  SELECT ts_lexize('unaccent', 'ёлка');
  SELECT ts_lexize('unaccent', 'ЁЖИК');
+SELECT ts_lexize('unaccent', '˃˖˗˜');
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

index 76e4e69bebb8c8d4c053a81a1114a773275804a5..7ce25eef03d61668e0d631ed0cb1d4391fc402a9 100644 (file)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -399,6 +399,21 @@
  ʦ     ts
  ʪ     ls
  ʫ     lz
+ʹ     '
+ʺ     "
+ʻ     '
+ʼ     '
+ʽ     '
+˂     <
+˃     >
+˄     ^
+ˆ     ^
+ˈ     '
+ˋ     `
+ː     :
+˖     +
+˗     -
+˜     ~
  Ά     Α
  Έ     Ε
  Ή     Η
author	Michael Paquier <michael@paquier.xyz>
	Thu, 10 Jan 2019 05:10:21 +0000 (14:10 +0900)
committer	Michael Paquier <michael@paquier.xyz>
	Thu, 10 Jan 2019 05:10:21 +0000 (14:10 +0900)
contrib/unaccent/expected/unaccent.out		patch | blob | history
contrib/unaccent/generate_unaccent_rules.py		patch | blob | history
contrib/unaccent/sql/unaccent.sql		patch | blob | history
contrib/unaccent/unaccent.rules		patch | blob | history