Do not insert characters for unicode-escape decoders if the error mode is "ignore".

author Martin v. Löwis <martin@v.loewis.de>

Thu, 21 Mar 2002 08:55:28 +0000 (08:55 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Thu, 21 Mar 2002 08:55:28 +0000 (08:55 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Thu, 21 Mar 2002 08:55:28 +0000 (08:55 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Thu, 21 Mar 2002 08:55:28 +0000 (08:55 +0000)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index 56f18112623c13aeb394e03b5b994b8426a43b55..ad47f31b7da0694c8533a775cefca3b447e888fc 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -541,6 +541,14 @@ else:
  verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
  verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
  
+verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
+try:
+    "\\".decode("unicode-escape")
+except ValueError:
+    pass
+else:
+    raise TestFailed, '"\\".decode("unicode-escape") should fail'
+
  verify(u'hello'.encode('ascii') == 'hello')
  verify(u'hello'.encode('utf-7') == 'hello')
  verify(u'hello'.encode('utf-8') == 'hello')
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index c318bd6437d8ca005d0ad8bdbfc9a82e80589c4b..978ac54c923d2dbd37af3d2831e2759439bbf030 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1514,8 +1514,7 @@ PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
  /* --- Unicode Escape Codec ----------------------------------------------- */
  
  static
-int unicodeescape_decoding_error(const char **source,
-                                 Py_UNICODE *x,
+int unicodeescape_decoding_error(Py_UNICODE **x,
                                   const char *errors,
                                   const char *details) 
  {
@@ -1530,7 +1529,8 @@ int unicodeescape_decoding_error(const char **source,
          return 0;
      }
      else if (strcmp(errors,"replace") == 0) {
-        *x = Py_UNICODE_REPLACEMENT_CHARACTER;
+        **x = Py_UNICODE_REPLACEMENT_CHARACTER;
+       (*x)++;
          return 0;
      }
      else {
@@ -1628,9 +1628,9 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
              for (i = 0; i < digits; i++) {
                  c = (unsigned char) s[i];
                  if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors, message))
+                    if (unicodeescape_decoding_error(&p, errors, message))
                          goto onError;
-                    chr = x;
+                    chr = 0xffffffff;
                      i++;
                      break;
                  }
@@ -1643,6 +1643,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                      chr += 10 + c - 'A';
              }
              s += i;
+           if (chr == 0xffffffff)
+                   /* _decoding_error will have already written into the
+                      target buffer. */
+                   break;
          store:
              /* when we get here, chr is a 32-bit unicode character */
              if (chr <= 0xffff)
@@ -1660,11 +1664,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
  #endif
              } else {
                  if (unicodeescape_decoding_error(
-                    &s, &x, errors,
+                    &p, errors,
                      "illegal Unicode character")
                      )
                      goto onError;
-                *p++ = x; /* store replacement character */
              }
              break;
  
@@ -1699,14 +1702,19 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                          goto store;
                  }
              }
-            if (unicodeescape_decoding_error(&s, &x, errors, message))
+            if (unicodeescape_decoding_error(&p, errors, message))
                  goto onError;
-            *p++ = x;
              break;
  
          default:
-            *p++ = '\\';
-            *p++ = (unsigned char)s[-1];
+           if (s > end) {
+               if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
+                   goto onError;
+           }
+           else {
+               *p++ = '\\';
+               *p++ = (unsigned char)s[-1];
+           }
              break;
          }
      }
@@ -1909,7 +1917,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
      end = s + size;
      while (s < end) {
         unsigned char c;
-       Py_UNICODE x;
+       Py_UCS4 x;
         int i;
  
         /* Non-escape characters are interpreted as Unicode ordinals */
@@ -1938,9 +1946,10 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
         for (x = 0, i = 0; i < 4; i++) {
             c = (unsigned char)s[i];
             if (!isxdigit(c)) {
-               if (unicodeescape_decoding_error(&s, &x, errors,
+               if (unicodeescape_decoding_error(&p, errors,
                                                  "truncated \\uXXXX"))
                     goto onError;
+               x = 0xffffffff;
                 i++;
                 break;
             }
@@ -1953,7 +1962,8 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
                 x += 10 + c - 'A';
         }
         s += i;
-       *p++ = x;
+       if (x != 0xffffffff)
+               *p++ = x;
      }
      if (_PyUnicode_Resize(&v, (int)(p - buf)))
         goto onError;
author	Martin v. Löwis <martin@v.loewis.de>
	Thu, 21 Mar 2002 08:55:28 +0000 (08:55 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Thu, 21 Mar 2002 08:55:28 +0000 (08:55 +0000)
Lib/test/test_unicode.py		patch | blob | history
Objects/unicodeobject.c		patch | blob | history