reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it ... [commit subject truncated in this view])

author Fredrik Lundh <fredrik@pythonware.com>

Sun, 18 Feb 2001 22:13:49 +0000 (22:13 +0000)

committer Fredrik Lundh <fredrik@pythonware.com>

Sun, 18 Feb 2001 22:13:49 +0000 (22:13 +0000)
author Fredrik Lundh <fredrik@pythonware.com>
Sun, 18 Feb 2001 22:13:49 +0000 (22:13 +0000)
committer Fredrik Lundh <fredrik@pythonware.com>
Sun, 18 Feb 2001 22:13:49 +0000 (22:13 +0000)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 7b12594f72f4b89df5d7a9b7416b5a0247ec8bf6..c237789a79edfef77e4ecb224b24e7781dc2f263 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1110,10 +1110,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                                         const char *errors)
  {
      PyUnicodeObject *v;
-    Py_UNICODE *p = NULL, *buf = NULL;
+    Py_UNICODE *p, *buf;
      const char *end;
-    Py_UCS4 chr;
-    
+    char* message;
+    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
+
      /* Escaped strings will always be longer than the resulting
         Unicode string, so we start with size here and then reduce the
         length after conversion to the true value. */
@@ -1122,16 +1123,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
          goto onError;
      if (size == 0)
          return (PyObject *)v;
+
      p = buf = PyUnicode_AS_UNICODE(v);
      end = s + size;
+
      while (s < end) {
          unsigned char c;
          Py_UNICODE x;
-        int i;
+        int i, digits;
  
          /* Non-escape characters are interpreted as Unicode ordinals */
          if (*s != '\\') {
-            *p++ = (unsigned char)*s++;
+            *p++ = (unsigned char) *s++;
              continue;
          }
  
@@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
              *p++ = x;
              break;
  
-        /* \xXX with two hex digits */
+        /* hex escapes */
+        /* \xXX */
          case 'x':
-            for (x = 0, i = 0; i < 2; i++) {
-                c = (unsigned char)s[i];
-                if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors,
-                                                     "truncated \\xXX"))
-                        goto onError;
-                    i++;
-                    break;
-                }
-                x = (x<<4) & ~0xF;
-                if (c >= '0' && c <= '9')
-                    x += c - '0';
-                else if (c >= 'a' && c <= 'f')
-                    x += 10 + c - 'a';
-                else
-                    x += 10 + c - 'A';
-            }
-            s += i;
-            *p++ = x;
-            break;
+            digits = 2;
+            message = "truncated \\xXX escape";
+            goto hexescape;
  
-        /* \uXXXX with 4 hex digits */
+        /* \uXXXX */
          case 'u':
-            for (x = 0, i = 0; i < 4; i++) {
-                c = (unsigned char)s[i];
-                if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors,
-                                                     "truncated \\uXXXX"))
-                        goto onError;
-                    i++;
-                    break;
-                }
-                x = (x<<4) & ~0xF;
-                if (c >= '0' && c <= '9')
-                    x += c - '0';
-                else if (c >= 'a' && c <= 'f')
-                    x += 10 + c - 'a';
-                else
-                    x += 10 + c - 'A';
-            }
-            s += i;
-            *p++ = x;
-            break;
+            digits = 4;
+            message = "truncated \\uXXXX escape";
+            goto hexescape;
  
-        /* \UXXXXXXXX with 8 hex digits */
+        /* \UXXXXXXXX */
          case 'U':
-            for (chr = 0, i = 0; i < 8; i++) {
-                c = (unsigned char)s[i];
+            digits = 8;
+            message = "truncated \\UXXXXXXXX escape";
+        hexescape:
+            chr = 0;
+            for (i = 0; i < digits; i++) {
+                c = (unsigned char) s[i];
                  if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors,
-                                                     "truncated \\uXXXX"))
+                    if (unicodeescape_decoding_error(&s, &x, errors, message))
                          goto onError;
+                    chr = x;
                      i++;
                      break;
                  }
@@ -1230,19 +1204,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                      chr += 10 + c - 'A';
              }
              s += i;
-            goto store;
+        store:
+            /* when we get here, chr is a 32-bit unicode character */
+            if (chr <= 0xffff)
+                /* UCS-2 character */
+                *p++ = (Py_UNICODE) chr;
+            else if (chr <= 0x10ffff) {
+                /* UCS-4 character.  store as two surrogate characters */
+                chr -= 0x10000L;
+                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
+                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
+            } else {
+                if (unicodeescape_decoding_error(
+                    &s, &x, errors,
+                    "illegal Unicode character")
+                    )
+                    goto onError;
+                *p++ = x; /* store replacement character */
+            }
+            break;
  
+        /* \N{name} */
          case 'N':
-            /* Ok, we need to deal with Unicode Character Names now,
-             * make sure we've imported the hash table data...
-             */
+            message = "malformed \\N character escape";
              if (ucnhash_CAPI == NULL) {
-                PyObject *mod = 0, *v = 0;
-                mod = PyImport_ImportModule("unicodedata");
-                if (mod == NULL)
+                /* load the unicode data module */
+                PyObject *m, *v;
+                m = PyImport_ImportModule("unicodedata");
+                if (m == NULL)
                      goto ucnhashError;
-                v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
-                Py_DECREF(mod);
+                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
+                Py_DECREF(m);
                  if (v == NULL)
                      goto ucnhashError;
                  ucnhash_CAPI = PyCObject_AsVoidPtr(v);
@@ -1250,75 +1242,42 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                  if (ucnhash_CAPI == NULL)
                      goto ucnhashError;
              }
-                
              if (*s == '{') {
-                const char *start = s + 1;
-                const char *endBrace = start;
-
+                const char *start = s+1;
                  /* look for the closing brace */
-                while (*endBrace != '}' && endBrace < end)
-                    endBrace++;
-                if (endBrace != end && *endBrace == '}') {
-                    if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
-                        if (unicodeescape_decoding_error(
-                                &s, &x, errors,
-                                "Invalid Unicode Character Name")
-                            )
-                            goto onError;
-                        goto ucnFallthrough;
-                    }
-                    s = endBrace + 1;
-                    goto store;
-                } else {
-                    if (unicodeescape_decoding_error(
-                            &s, &x, errors,
-                            "Unicode name missing closing brace"))
-                        goto onError;
-                    goto ucnFallthrough;
+                while (*s != '}' && s < end)
+                    s++;
+                if (s > start && s < end && *s == '}') {
+                    /* found a name.  look it up in the unicode database */
+                    message = "unknown Unicode character name";
+                    s++;
+                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
+                        goto store;
                  }
-                break;                
              }
-            if (unicodeescape_decoding_error(
-                    &s, &x, errors,
-                    "Missing opening brace for Unicode Character Name escape"))
+            if (unicodeescape_decoding_error(&s, &x, errors, message))
                  goto onError;
-ucnFallthrough:
-            /* fall through on purpose */
-               default:
+            *p++ = x;
+            break;
+
+        default:
              *p++ = '\\';
              *p++ = (unsigned char)s[-1];
              break;
-store:
-            /* when we get here, chr is a 32-bit unicode character */
-            if (chr <= 0xffff)
-                /* UCS-2 character */
-                *p++ = (Py_UNICODE) chr;
-            else if (chr <= 0x10ffff) {
-                /* UCS-4 character.  store as two surrogate characters */
-                chr -= 0x10000L;
-                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
-                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
-            } else {
-                if (unicodeescape_decoding_error(
-                    &s, &x, errors,
-                    "Illegal Unicode character")
-                    )
-                    goto onError;
-            }
          }
      }
      if (_PyUnicode_Resize(v, (int)(p - buf)))
                 goto onError;
      return (PyObject *)v;
      
- ucnhashError:
+ucnhashError:
      PyErr_SetString(
          PyExc_UnicodeError,
          "\\N escapes not supported (can't load unicodedata module)"
          );
      return NULL;
  
- onError:
+onError:
      Py_XDECREF(v);
      return NULL;
  }
author	Fredrik Lundh <fredrik@pythonware.com>
	Sun, 18 Feb 2001 22:13:49 +0000 (22:13 +0000)
committer	Fredrik Lundh <fredrik@pythonware.com>
	Sun, 18 Feb 2001 22:13:49 +0000 (22:13 +0000)