in narrow builds, make sure to test codepoints as identifier characters (closes #12732)

author Benjamin Peterson <benjamin@python.org>

Sat, 13 Aug 2011 03:17:18 +0000 (22:17 -0500)

committer Benjamin Peterson <benjamin@python.org>

Sat, 13 Aug 2011 03:17:18 +0000 (22:17 -0500)
author Benjamin Peterson <benjamin@python.org>
Sat, 13 Aug 2011 03:17:18 +0000 (22:17 -0500)
committer Benjamin Peterson <benjamin@python.org>
Sat, 13 Aug 2011 03:17:18 +0000 (22:17 -0500)
diff --git a/Lib/test/test_pep3131.py b/Lib/test/test_pep3131.py

index 9d5f217165f0d89b3a27f29b09bf8439f97faa1d..ed7558a7eec7ba1a9e42c1b5870d70ae74cc75a9 100644 (file)
--- a/Lib/test/test_pep3131.py
+++ b/Lib/test/test_pep3131.py
@@ -8,9 +8,12 @@ class PEP3131Test(unittest.TestCase):
              ä = 1
              µ = 2 # this is a compatibility character
              蟒 = 3
+            𝔘𝔫𝔦𝔠𝔬𝔡𝔢  = 4
          self.assertEqual(getattr(T, "\xe4"), 1)
          self.assertEqual(getattr(T, "\u03bc"), 2)
          self.assertEqual(getattr(T, '\u87d2'), 3)
+        v = getattr(T, "\U0001d518\U0001d52b\U0001d526\U0001d520\U0001d52c\U0001d521\U0001d522")
+        self.assertEqual(v, 4)
  
      def test_invalid(self):
          try:
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index 55aaba6d3b48f8c1ec62970a6bb0cd02f308557a..09cf48f10b2dca8b96fa5115d0429f191112373a 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -404,6 +404,7 @@ class UnicodeTest(string_tests.CommonTest,
          self.assertTrue("bc".isidentifier())
          self.assertTrue("b_".isidentifier())
          self.assertTrue("µ".isidentifier())
+        self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
  
          self.assertFalse(" ".isidentifier())
          self.assertFalse("[".isidentifier())
diff --git a/Misc/NEWS b/Misc/NEWS

index c9a0522cfc7f1655f8210dfd679216636ae4600d..354d09a070c10cbbdd2fade41f78ba83e6d5592c 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.2.2?
  Core and Builtins
  -----------------
  
+- Issue #12732: In narrow unicode builds, allow Unicode identifiers which fall
+  outside the BMP.
+
  - Issue #11603: Fix a crash when __str__ is rebound as __repr__.  Patch by
    Andreas Stührk.
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 75da0e7e8d28da40a5503205d250cae10b0a5ea5..a48b8b41b1348d419b60d39ab93af79cf348a5ab 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7972,14 +7972,30 @@ unicode_isnumeric(PyUnicodeObject *self)
      return PyBool_FromLong(1);
  }
  
+static Py_UCS4
+decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size)
+{
+    Py_UCS4 ch;
+    assert(*i < size);
+    ch = s[(*i)++];
+#ifndef Py_UNICODE_WIDE
+    if ((ch & 0xfffffc00) == 0xd800 &&
+        *i < size
+        && (s[*i] & 0xFFFFFC00) == 0xDC00)
+        ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00;
+#endif
+    return ch;
+}
+
  int
  PyUnicode_IsIdentifier(PyObject *self)
  {
-    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
-    register const Py_UNICODE *e;
+    Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self);
+    Py_UCS4 first;
+    const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
  
      /* Special case for empty strings */
-    if (PyUnicode_GET_SIZE(self) == 0)
+    if (!size)
          return 0;
  
      /* PEP 3131 says that the first character must be in
@@ -7990,14 +8006,13 @@ PyUnicode_IsIdentifier(PyObject *self)
         definition of XID_Start and XID_Continue, it is sufficient
         to check just for these, except that _ must be allowed
         as starting an identifier.  */
-    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
+    first = decode_ucs4(p, &i, size);
+    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
          return 0;
  
-    e = p + PyUnicode_GET_SIZE(self);
-    for (p++; p < e; p++) {
-        if (!_PyUnicode_IsXidContinue(*p))
+    while (i < size)
+        if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size)))
              return 0;
-    }
      return 1;
  }
author	Benjamin Peterson <benjamin@python.org>
	Sat, 13 Aug 2011 03:17:18 +0000 (22:17 -0500)
committer	Benjamin Peterson <benjamin@python.org>
	Sat, 13 Aug 2011 03:17:18 +0000 (22:17 -0500)
Lib/test/test_pep3131.py		patch | blob | history
Lib/test/test_unicode.py		patch | blob | history
Misc/NEWS		patch | blob | history
Objects/unicodeobject.c		patch | blob | history