Fix for the UTF-8 memory allocation bug and the UTF-8 encoding bug related to lone high surrogates.

author Marc-André Lemburg <mal@egenix.com>

Wed, 6 Feb 2002 18:09:02 +0000 (18:09 +0000)

committer Marc-André Lemburg <mal@egenix.com>

Wed, 6 Feb 2002 18:09:02 +0000 (18:09 +0000)
author Marc-André Lemburg <mal@egenix.com>
Wed, 6 Feb 2002 18:09:02 +0000 (18:09 +0000)
committer Marc-André Lemburg <mal@egenix.com>
Wed, 6 Feb 2002 18:09:02 +0000 (18:09 +0000)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index 8d4bed56474136464c6e1d15c9630628b4a4e1ab..5368f6ef6064a3855fb691798c2154976e82470c 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -495,19 +495,18 @@ else:
  verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
  
  # UTF-8 specific encoding tests:
-verify(u'\u20ac'.encode('utf-8') == \
-       ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
-verify(u'\ud800\udc02'.encode('utf-8') == \
-       ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
-verify(u'\ud84d\udc56'.encode('utf-8') == \
-       ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
+verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
+verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
+verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
+verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
+verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
+verify((u'\ud800\udc02'*1000).encode('utf-8') ==
+       '\xf0\x90\x80\x82'*1000)
+
  # UTF-8 specific decoding tests
-verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
-               'utf-8') == u'\U00023456' )
-verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
-               'utf-8') == u'\U00010002' )
-verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
-               'utf-8') == u'\u20ac' )
+verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
+verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
+verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )
  
  # Other possible utf-8 test cases:
  # * strict decoding testing for all of the
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 68afaa05c85fb25c5e3c1a7932b0878d36f2ee9f..c7e5c8af2ebaf29467ab0f8ee57d6bca5d879cf1 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1171,61 +1171,64 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
  {
      PyObject *v;
      char *p;
-    char *q;
-    Py_UCS4 ch2;
-    unsigned int cbAllocated = 3 * size;
+    unsigned int cbAllocated = 2 * size;
      unsigned int cbWritten = 0;
      int i = 0;
  
-    v = PyString_FromStringAndSize(NULL, cbAllocated);
+    v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
      if (v == NULL)
          return NULL;
      if (size == 0)
          return v;
  
-    p = q = PyString_AS_STRING(v);
+    p = PyString_AS_STRING(v);
      while (i < size) {
          Py_UCS4 ch = s[i++];
+
          if (ch < 0x80) {
              *p++ = (char) ch;
              cbWritten++;
          }
+
          else if (ch < 0x0800) {
              *p++ = 0xc0 | (ch >> 6);
              *p++ = 0x80 | (ch & 0x3f);
              cbWritten += 2;
          }
-        else if (ch < 0x10000) {
-            /* Check for high surrogate */
-            if (0xD800 <= ch && ch <= 0xDBFF) {
-                if (i != size) {
-                    ch2 = s[i];
-                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                          
-                        if (cbWritten >= (cbAllocated - 4)) {
-                           /* Provide enough room for some more
-                              surrogates */
-                           cbAllocated += 4*10;
-                            if (_PyString_Resize(&v, cbAllocated))
+        else {
+           
+           /* Assure that we have enough room for high order Unicode
+              ordinals */
+           if (cbWritten >= cbAllocated) {
+               cbAllocated += 4 * 10;
+               if (_PyString_Resize(&v, cbAllocated + 4))
                                 goto onError;
+               p = PyString_AS_STRING(v) + cbWritten;
                          }
  
-                        /* combine the two values */
+           if (ch < 0x10000) {
+               /* Check for high surrogate */
+               if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
+                   Py_UCS4 ch2 = s[i];
+                   /* Check for low surrogate */
+                   if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                          ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
-                    
                          *p++ = (char)((ch >> 18) | 0xf0);
                          *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+                       *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+                       *p++ = (char)(0x80 | (ch & 0x3f));
                          i++;
                          cbWritten += 4;
+                       continue;
                      }
+                   /* Fall through: handles isolated high surrogates */
                  }
-            }
-            else {
                  *p++ = (char)(0xe0 | (ch >> 12));
-                cbWritten += 3;
-            }
              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
              *p++ = (char)(0x80 | (ch & 0x3f));
+               cbWritten += 3;
+    
          } else {
              *p++ = 0xf0 | (ch>>18);
              *p++ = 0x80 | ((ch>>12) & 0x3f);
@@ -1234,8 +1237,9 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
              cbWritten += 4;
         }
      }
+    }
      *p = '\0';
-    if (_PyString_Resize(&v, p - q))
+    if (_PyString_Resize(&v, cbWritten))
         goto onError;
      return v;
author	Marc-André Lemburg <mal@egenix.com>
	Wed, 6 Feb 2002 18:09:02 +0000 (18:09 +0000)
committer	Marc-André Lemburg <mal@egenix.com>
	Wed, 6 Feb 2002 18:09:02 +0000 (18:09 +0000)
Lib/test/test_unicode.py		patch | blob | history
Objects/unicodeobject.c		patch | blob | history