bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958)

author Wonsup Yoon <pusnow@me.com>

Fri, 15 Jun 2018 12:03:14 +0000 (21:03 +0900)

committer Xiang Zhang <angwerzx@126.com>

Fri, 15 Jun 2018 12:03:14 +0000 (20:03 +0800)
author Wonsup Yoon <pusnow@me.com>
Fri, 15 Jun 2018 12:03:14 +0000 (21:03 +0900)
committer Xiang Zhang <angwerzx@126.com>
Fri, 15 Jun 2018 12:03:14 +0000 (20:03 +0800)
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py

index 99dd0dec9d11d68dd7a691ce630b1a184ee2b7cb..170778fa977db1bad47331157b37c7fcb12fded7 100644 (file)
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -208,6 +208,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
          b = 'C\u0338' * 20  + '\xC7'
          self.assertEqual(self.db.normalize('NFC', a), b)
  
+    def test_issue29456(self):
+        # bpo-29456: NFC must compose only modern Hangul jamo (L, V and T).
+        u1176_str_a = '\u1100\u1176\u11a8'  # U+1176 is past the last modern vowel (U+1175)
+        u1176_str_b = '\u1100\u1176\u11a8'  # expected: left unchanged
+        u11a7_str_a = '\u1100\u1175\u11a7'  # U+11A7 is before the first modern trailing consonant (U+11A8)
+        u11a7_str_b = '\uae30\u11a7'  # expected: L+V composed, U+11A7 kept separate
+        u11c3_str_a = '\u1100\u1175\u11c3'  # U+11C3 is past the last modern trailing consonant (U+11C2)
+        u11c3_str_b = '\uae30\u11c3'  # expected: L+V composed, U+11C3 kept separate
+        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
+        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
+        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
+
+
      def test_east_asian_width(self):
          eaw = self.db.east_asian_width
          self.assertRaises(TypeError, eaw, b'a')
diff --git a/Misc/ACKS b/Misc/ACKS

index 96aad5073a3f87cd139ec4faed16a57c0ad19dc9..25d1db078105bf14fbbf84079fb802b7373016d6 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1800,6 +1800,7 @@ Jason Yeo
  EungJun Yi
  Bob Yodlowski
  Danny Yoo
+Wonsup Yoon
  Rory Yorke
  George Yoshida
  Kazuhiro Yoshida
diff --git a/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst

new file mode 100644 (file)

index 0000000..9b30bf6
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst
@@ -0,0 +1 @@
+Fix bugs in Hangul normalization: U+1176, U+11A7 and U+11C3 are no longer composed into precomposed syllables
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c

index 7a9a964a0f6cf0606a1b56587a5efea960442e21..e8788f5036ddefee37f9996fb03454973753e9ed 100644 (file)
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
-          PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
+          PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
+          /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
+             and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < len &&
-              TBase <= PyUnicode_READ(kind, data, i) &&
-              PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
+              TBase < PyUnicode_READ(kind, data, i) &&
+              PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
+              /* check T character is a modern trailing consonant
+                 (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
author	Wonsup Yoon <pusnow@me.com>
	Fri, 15 Jun 2018 12:03:14 +0000 (21:03 +0900)
committer	Xiang Zhang <angwerzx@126.com>
	Fri, 15 Jun 2018 12:03:14 +0000 (20:03 +0800)
Lib/test/test_unicodedata.py		patch \| blob \| history
Misc/ACKS		patch \| blob \| history
Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst	[new file with mode: 0644]	patch \| blob
Modules/unicodedata.c		patch \| blob \| history