Unicode: resize_compact() and resize_inplace() now also fill Unicode strings with invalid characters (debug build)

author Victor Stinner <victor.stinner@gmail.com>

Wed, 3 Oct 2012 21:03:17 +0000 (23:03 +0200)

committer Victor Stinner <victor.stinner@gmail.com>

Wed, 3 Oct 2012 21:03:17 +0000 (23:03 +0200)
author Victor Stinner <victor.stinner@gmail.com>
Wed, 3 Oct 2012 21:03:17 +0000 (23:03 +0200)
committer Victor Stinner <victor.stinner@gmail.com>
Wed, 3 Oct 2012 21:03:17 +0000 (23:03 +0200)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 83f2a2a9ba8d3210769ca70a51a5e1c4264eef0f..09067e919c46dd4f4bb7d4e6042ac7a55d858c39 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -640,6 +640,25 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
      }
  }
  
+#ifdef Py_DEBUG
+/* Fill the data of a Unicode string with invalid characters to detect bugs
+   earlier.
+
+   Only the characters at indexes [old_length, length) are overwritten, where
+   length is the current length of the string; every byte in that range is
+   set to 0xFF. The string kind is used as the per-character size in bytes,
+   so both the start offset and the byte count are scaled by it. If the
+   current length is not greater than old_length, nothing is written. Pass
+   old_length = 0 to fill the whole string.
+
+   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
+   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
+   invalid character in Unicode 6.0. */
+static void
+unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
+{
+    int kind = PyUnicode_KIND(unicode);
+    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
+    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
+    if (length <= old_length)
+        return;
+    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
+}
+#endif
+
  static PyObject*
  resize_compact(PyObject *unicode, Py_ssize_t length)
  {
@@ -648,6 +667,10 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
      Py_ssize_t new_size;
      int share_wstr;
      PyObject *new_unicode;
+#ifdef Py_DEBUG
+    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
+#endif
+
      assert(unicode_modifiable(unicode));
      assert(PyUnicode_IS_READY(unicode));
      assert(PyUnicode_IS_COMPACT(unicode));
@@ -683,6 +706,9 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
          if (!PyUnicode_IS_ASCII(unicode))
              _PyUnicode_WSTR_LENGTH(unicode) = length;
      }
+#ifdef Py_DEBUG
+    unicode_fill_invalid(unicode, old_length);
+#endif
      PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                      length, 0);
      assert(_PyUnicode_CheckConsistency(unicode, 0));
@@ -701,6 +727,9 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
          Py_ssize_t char_size;
          int share_wstr, share_utf8;
          void *data;
+#ifdef Py_DEBUG
+        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
+#endif
  
          data = _PyUnicode_DATA_ANY(unicode);
          char_size = PyUnicode_KIND(unicode);
@@ -736,6 +765,9 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
          }
          _PyUnicode_LENGTH(unicode) = length;
          PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
+#ifdef Py_DEBUG
+        unicode_fill_invalid(unicode, old_length);
+#endif
          if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
              assert(_PyUnicode_CheckConsistency(unicode, 0));
              return 0;
@@ -1060,11 +1092,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
          }
      }
  #ifdef Py_DEBUG
-    /* Fill the data with invalid characters to detect bugs earlier.
-       _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
-       at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
-       and U+FFFFFFFF is an invalid character in Unicode 6.0. */
-    memset(data, 0xff, size * kind);
+    unicode_fill_invalid((PyObject*)unicode, 0);
  #endif
      assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
      return obj;
author	Victor Stinner <victor.stinner@gmail.com>
	Wed, 3 Oct 2012 21:03:17 +0000 (23:03 +0200)
committer	Victor Stinner <victor.stinner@gmail.com>
	Wed, 3 Oct 2012 21:03:17 +0000 (23:03 +0200)