closes bpo-37966: Fully implement the UAX #15 quick-check algorithm. (GH-15558)

author Greg Price <gnprice@gmail.com>

Wed, 4 Sep 2019 02:45:44 +0000 (19:45 -0700)

committer Benjamin Peterson <benjamin@python.org>

Wed, 4 Sep 2019 02:45:44 +0000 (19:45 -0700)
author Greg Price <gnprice@gmail.com>
Wed, 4 Sep 2019 02:45:44 +0000 (19:45 -0700)
committer Benjamin Peterson <benjamin@python.org>
Wed, 4 Sep 2019 02:45:44 +0000 (19:45 -0700)
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst

index bcdb60d86d855346eadb1af090b4e9014cd292c0..4a1362d943c8098cda3a807a2ef64003dcd8dd5b 100644 (file)
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -1090,8 +1090,9 @@ unicodedata
    <http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release.
  
  * New function :func:`~unicodedata.is_normalized` can be used to verify a string
-  is in a specific normal form. (Contributed by Max Belanger and David Euresti in
-  :issue:`32285`).
+  is in a specific normal form, often much faster than by actually normalizing
+  the string.  (Contributed by Max Belanger, David Euresti, and Greg Price in
+  :issue:`32285` and :issue:`37966`).
  
  
  unittest
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py

index 7bc196be362b324effef02c77da68a362991d39c..9ec2f11497268a96c83362ee92b4674fe05b67be 100644 (file)
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -208,6 +208,8 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
          self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
          self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
  
+    # For tests of unicodedata.is_normalized / self.db.is_normalized ,
+    # see test_normalization.py .
  
      def test_east_asian_width(self):
          eaw = self.db.east_asian_width
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst

new file mode 100644 (file)

index 0000000..6b9d69c
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst
@@ -0,0 +1,3 @@
+The implementation of :func:`~unicodedata.is_normalized` has been greatly
+sped up on strings that aren't normalized, by implementing the full
+normalization-quick-check algorithm from the Unicode standard.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c

index ae0d4e46f9a409df9c0727787fbee3e8678300dd..5e8ba602d6684825f2f97497ef8c310fe781fd68 100644 (file)
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -19,6 +19,8 @@
  #include "ucnhash.h"
  #include "structmember.h"
  
+#include <stdbool.h>
+
  _Py_IDENTIFIER(NFC);
  _Py_IDENTIFIER(NFD);
  _Py_IDENTIFIER(NFKC);
@@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
      return result;
  }
  
-typedef enum {YES, NO, MAYBE} NormalMode;
-
-/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
-static NormalMode
-is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+// This needs to match the logic in makeunicodedata.py
+// which constructs the quickcheck data.
+typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
+
+/* Run the Unicode normalization "quickcheck" algorithm.
+ *
+ * Return YES or NO if quickcheck determines the input is certainly
+ * normalized or certainly not, and MAYBE if quickcheck is unable to
+ * tell.
+ *
+ * If `yes_only` is true, then return MAYBE as soon as we determine
+ * the answer is not YES.
+ *
+ * For background and details on the algorithm, see UAX #15:
+ *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
+ */
+static QuickcheckResult
+is_normalized_quickcheck(PyObject *self, PyObject *input,
+                         int nfc, int k, bool yes_only)
  {
-    Py_ssize_t i, len;
-    int kind;
-    void *data;
-    unsigned char prev_combining = 0, quickcheck_mask;
-
      /* An older version of the database is requested, quickchecks must be
         disabled. */
      if (self && UCD_Check(self))
          return NO;
  
-    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
-       as described in http://unicode.org/reports/tr15/#Annex8. */
-    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+    Py_ssize_t i, len;
+    int kind;
+    void *data;
+    unsigned char prev_combining = 0;
+
+    /* The two quickcheck bits at this shift have type QuickcheckResult. */
+    int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
+
+    QuickcheckResult result = YES; /* certainly normalized, unless we find something */
  
      i = 0;
      kind = PyUnicode_KIND(input);
@@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
      while (i < len) {
          Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
          const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
-        unsigned char combining = record->combining;
-        unsigned char quickcheck = record->normalization_quick_check;
  
-        if (quickcheck & quickcheck_mask)
-            return MAYBE; /* this string might need normalization */
+        unsigned char combining = record->combining;
          if (combining && prev_combining > combining)
              return NO; /* non-canonical sort order, not normalized */
          prev_combining = combining;
+
+        unsigned char quickcheck_whole = record->normalization_quick_check;
+        if (yes_only) {
+            if (quickcheck_whole & (3 << quickcheck_shift))
+                return MAYBE;
+        } else {
+            switch ((quickcheck_whole >> quickcheck_shift) & 3) {
+            case NO:
+              return NO;
+            case MAYBE:
+              result = MAYBE; /* this string might need normalization */
+            }
+        }
      }
-    return YES; /* certainly normalized */
+    return result;
  }
  
  /*[clinic input]
@@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
      PyObject *result;
      int nfc = 0;
      int k = 0;
-    NormalMode m;
+    QuickcheckResult m;
  
      PyObject *cmp;
      int match = 0;
@@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
          return NULL;
      }
  
-    m = is_normalized(self, input, nfc, k);
+    m = is_normalized_quickcheck(self, input, nfc, k, false);
  
      if (m == MAYBE) {
          cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
@@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
      }
  
      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
-        if (is_normalized(self, input, 1, 0) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
              Py_INCREF(input);
              return input;
          }
          return nfc_nfkc(self, input, 0);
      }
      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
-        if (is_normalized(self, input, 1, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
              Py_INCREF(input);
              return input;
          }
          return nfc_nfkc(self, input, 1);
      }
      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
-        if (is_normalized(self, input, 0, 0) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
              Py_INCREF(input);
              return input;
          }
          return nfd_nfkd(self, input, 0);
      }
      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
-        if (is_normalized(self, input, 0, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
              Py_INCREF(input);
              return input;
          }
author	Greg Price <gnprice@gmail.com>
	Wed, 4 Sep 2019 02:45:44 +0000 (19:45 -0700)
committer	Benjamin Peterson <benjamin@python.org>
	Wed, 4 Sep 2019 02:45:44 +0000 (19:45 -0700)
Doc/whatsnew/3.8.rst		patch | blob | history
Lib/test/test_unicodedata.py		patch | blob | history
Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst	[new file with mode: 0644]	patch | blob
Modules/unicodedata.c		patch | blob | history