Consider \U-escapes in raw-unicode-escape. Fixes #444514.

author Martin v. Löwis <martin@v.loewis.de>

Sun, 18 May 2003 12:31:09 +0000 (12:31 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Sun, 18 May 2003 12:31:09 +0000 (12:31 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Sun, 18 May 2003 12:31:09 +0000 (12:31 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Sun, 18 May 2003 12:31:09 +0000 (12:31 +0000)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index 9250ef2605ae833718c36cc888a769e51a5d369f..8e1f0b1b54924e355501025bf736305623e0e37b 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -697,6 +697,13 @@ class UnicodeTest(
          print >>out, u'def\n'
          print >>out, u'def\n'
  
+    def test_ucs4(self):
+        if sys.maxunicode == 0xFFFF:
+            return
+        x = u'\U00100000'
+        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
+        self.assertEqual(x, y)
+
  def test_main():
      test_support.run_unittest(UnicodeTest)
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 096dfcb7c9afc39059129eb765bca180438f3b8e..94c67c84df3b7d5f77a2e73aa374623053bb7d3a 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2030,6 +2030,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
         unsigned char c;
         Py_UCS4 x;
         int i;
+        int count;
  
         /* Non-escape characters are interpreted as Unicode ordinals */
         if (*s != '\\') {
@@ -2048,15 +2049,16 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
         }
         if (((s - bs) & 1) == 0 ||
             s >= end ||
-           *s != 'u') {
+           (*s != 'u' && *s != 'U')) {
             continue;
         }
         p--;
+        count = *s=='u' ? 4 : 8;
         s++;
  
-       /* \uXXXX with 4 hex digits */
+       /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
         outpos = p-PyUnicode_AS_UNICODE(v);
-       for (x = 0, i = 0; i < 4; ++i, ++s) {
+       for (x = 0, i = 0; i < count; ++i, ++s) {
             c = (unsigned char)*s;
             if (!isxdigit(c)) {
                 endinpos = s-starts;
@@ -2076,6 +2078,16 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
             else
                 x += 10 + c - 'A';
         }
+#ifndef Py_UNICODE_WIDE
+        if (x > 0x10000) {
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
+                   starts, size, &startinpos, &endinpos, &exc, &s,
+                   (PyObject **)&v, &outpos, &p))
+                   goto onError;
+        }
+#endif
         *p++ = x;
         nextByte:
         ;
@@ -2102,7 +2114,11 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
  
      static const char *hexdigit = "0123456789abcdef";
  
+#ifdef Py_UNICODE_WIDE
+    repr = PyString_FromStringAndSize(NULL, 10 * size);
+#else
      repr = PyString_FromStringAndSize(NULL, 6 * size);
+#endif
      if (repr == NULL)
          return NULL;
      if (size == 0)
@@ -2111,6 +2127,22 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
      p = q = PyString_AS_STRING(repr);
      while (size-- > 0) {
          Py_UNICODE ch = *s++;
+#ifdef Py_UNICODE_WIDE
+       /* Map 32-bit characters to '\Uxxxxxxxx' */
+       if (ch >= 0x10000) {
+            *p++ = '\\';
+            *p++ = 'U';
+            *p++ = hexdigit[(ch >> 28) & 0xf];
+            *p++ = hexdigit[(ch >> 24) & 0xf];
+            *p++ = hexdigit[(ch >> 20) & 0xf];
+            *p++ = hexdigit[(ch >> 16) & 0xf];
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 15];
+        } 
+        else
+#endif
         /* Map 16-bit characters to '\uxxxx' */
         if (ch >= 256) {
              *p++ = '\\';
@@ -6769,3 +6801,10 @@ _PyUnicode_Fini(void)
      unicode_freelist = NULL;
      unicode_freelist_size = 0;
  }
+
+/*
+Local variables:
+c-basic-offset: 4
+indent-tabs-mode: nil
+End:
+*/
author	Martin v. Löwis <martin@v.loewis.de>
	Sun, 18 May 2003 12:31:09 +0000 (12:31 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Sun, 18 May 2003 12:31:09 +0000 (12:31 +0000)
Lib/test/test_unicode.py		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history