#include "ucnhash.h"
#include "structmember.h"
+#include <stdbool.h>
+
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
return result;
}
-typedef enum {YES, NO, MAYBE} NormalMode;
-
-/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
-static NormalMode
-is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+// This needs to match the logic in makeunicodedata.py
+// which constructs the quickcheck data.
+typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
+
+/* Run the Unicode normalization "quickcheck" algorithm.
+ *
+ * Return YES or NO if quickcheck determines the input is certainly
+ * normalized or certainly not, and MAYBE if quickcheck is unable to
+ * tell.
+ *
+ * If `yes_only` is true, then return MAYBE as soon as we determine
+ * the answer is not YES.
+ *
+ * For background and details on the algorithm, see UAX #15:
+ * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
+ */
+static QuickcheckResult
+is_normalized_quickcheck(PyObject *self, PyObject *input,
+ int nfc, int k, bool yes_only)
{
- Py_ssize_t i, len;
- int kind;
- void *data;
- unsigned char prev_combining = 0, quickcheck_mask;
-
/* An older version of the database is requested, quickchecks must be
disabled. Report MAYBE (quickcheck is unable to tell) rather than NO:
NO would claim the input is certainly not normalized, which has not
been determined here, and callers treat only MAYBE as the signal to
fall back to a full normalization and comparison. */
if (self && UCD_Check(self))
return MAYBE;
- /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
- as described in http://unicode.org/reports/tr15/#Annex8. */
- quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+ Py_ssize_t i, len;
+ int kind;
+ void *data;
+ unsigned char prev_combining = 0;
+
+ /* The two quickcheck bits at this shift have type QuickcheckResult. */
+ int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
+
+ QuickcheckResult result = YES; /* certainly normalized, unless we find something */
i = 0;
kind = PyUnicode_KIND(input);
while (i < len) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
- unsigned char combining = record->combining;
- unsigned char quickcheck = record->normalization_quick_check;
- if (quickcheck & quickcheck_mask)
- return MAYBE; /* this string might need normalization */
+ unsigned char combining = record->combining;
if (combining && prev_combining > combining)
return NO; /* non-canonical sort order, not normalized */
prev_combining = combining;
+
+ unsigned char quickcheck_whole = record->normalization_quick_check;
+ if (yes_only) {
+ if (quickcheck_whole & (3 << quickcheck_shift))
+ return MAYBE;
+ } else {
+ switch ((quickcheck_whole >> quickcheck_shift) & 3) {
+ case NO:
+ return NO;
+ case MAYBE:
+ result = MAYBE; /* this string might need normalization */
+ }
+ }
}
- return YES; /* certainly normalized */
+ return result;
}
/*[clinic input]
PyObject *result;
int nfc = 0;
int k = 0;
- NormalMode m;
+ QuickcheckResult m;
PyObject *cmp;
int match = 0;
return NULL;
}
- m = is_normalized(self, input, nfc, k);
+ m = is_normalized_quickcheck(self, input, nfc, k, false);
if (m == MAYBE) {
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
- if (is_normalized(self, input, 1, 0) == YES) {
+ if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 0);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
- if (is_normalized(self, input, 1, 1) == YES) {
+ if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 1);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
- if (is_normalized(self, input, 0, 0) == YES) {
+ if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 0);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
- if (is_normalized(self, input, 0, 1) == YES) {
+ if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
Py_INCREF(input);
return input;
}