Close #17694: Add minimum length to _PyUnicodeWriter

author Victor Stinner <victor.stinner@gmail.com>

Wed, 17 Apr 2013 21:02:17 +0000 (23:02 +0200)

committer Victor Stinner <victor.stinner@gmail.com>

Wed, 17 Apr 2013 21:02:17 +0000 (23:02 +0200)
author Victor Stinner <victor.stinner@gmail.com>
Wed, 17 Apr 2013 21:02:17 +0000 (23:02 +0200)
committer Victor Stinner <victor.stinner@gmail.com>
Wed, 17 Apr 2013 21:02:17 +0000 (23:02 +0200)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index d613311ab6f58fccb62016f9c8456ce448ba541c..ed7db28436ebee309466d5fd069b68d3b2935ee7 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -898,22 +898,28 @@ typedef struct {
      Py_UCS4 maxchar;
      Py_ssize_t size;
      Py_ssize_t pos;
-    /* minimum length of the buffer when overallocation is enabled,
-       see _PyUnicodeWriter_Init() */
+
+    /* minimum number of allocated characters (default: 0) */
      Py_ssize_t min_length;
+
+    /* lower bound for the buffer's maximum character (default: 127, ASCII) */
+    Py_UCS4 min_char;
+
+    /* If non-zero, overallocate the buffer by 25% (default: 0). */
      unsigned char overallocate;
+
      /* If readonly is 1, buffer is a shared string (cannot be modified)
         and size is set to 0. */
      unsigned char readonly;
  } _PyUnicodeWriter ;
  
  /* Initialize a Unicode writer.
-
-   If min_length is greater than zero, _PyUnicodeWriter_Prepare()
-   overallocates the buffer and min_length is the minimum length in characters
-   of the buffer. */
+ *
+ * By default, the minimum buffer size is 0 characters and overallocation is
+ * disabled. Set min_length, min_char and overallocate attributes to control
+ * the allocation of the buffer. */
  PyAPI_FUNC(void)
-_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length);
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
  
  /* Prepare the buffer to write 'length' characters
     with the specified maximum character.
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c

index 4c865eca11458c16ee2d7ca120a92d45e0475f3b..33bd77949392d31c159e3e82cfd75d5d5a9b6c18 100644 (file)
--- a/Modules/cjkcodecs/multibytecodec.c
+++ b/Modules/cjkcodecs/multibytecodec.c
@@ -633,7 +633,8 @@ MultibyteCodec_Decode(MultibyteCodecObject *self,
          return make_tuple(PyUnicode_New(0, 0), 0);
      }
  
-    _PyUnicodeWriter_Init(&buf.writer, datalen);
+    _PyUnicodeWriter_Init(&buf.writer);
+    buf.writer.min_length = datalen;
      buf.excobj = NULL;
      buf.inbuf = buf.inbuf_top = (unsigned char *)data;
      buf.inbuf_end = buf.inbuf_top + datalen;
@@ -839,7 +840,7 @@ decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data,
  {
      buf->inbuf = buf->inbuf_top = (const unsigned char *)data;
      buf->inbuf_end = buf->inbuf_top + size;
-    _PyUnicodeWriter_Init(&buf->writer, size);
+    buf->writer.min_length += size;
      return 0;
  }
  
@@ -1037,7 +1038,7 @@ mbidecoder_decode(MultibyteIncrementalDecoderObject *self,
      data = pdata.buf;
      size = pdata.len;
  
-    _PyUnicodeWriter_Init(&buf.writer, 1);
+    _PyUnicodeWriter_Init(&buf.writer);
      buf.excobj = NULL;
      origpending = self->pendingsize;
  
@@ -1241,7 +1242,7 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self,
      if (sizehint == 0)
          return PyUnicode_New(0, 0);
  
-    _PyUnicodeWriter_Init(&buf.writer, 1);
+    _PyUnicodeWriter_Init(&buf.writer);
      buf.excobj = NULL;
      cres = NULL;
  
diff --git a/Objects/complexobject.c b/Objects/complexobject.c

index 355b063f287dcc70d9efb49052b0bd05b2b9f263..54838ccdbd7f1b041831eb346d64580d9b76b87a 100644 (file)
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -705,7 +705,7 @@ complex__format__(PyObject* self, PyObject* args)
      if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
          return NULL;
  
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      ret = _PyComplex_FormatAdvancedWriter(
          &writer,
          self,
diff --git a/Objects/floatobject.c b/Objects/floatobject.c

index b571ca8c70ae07cf8ebd80f6900a378dfb502a60..c54c8e1a1da5d6f71a6980d97f86967ca75ad5b2 100644 (file)
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -1711,7 +1711,7 @@ float__format__(PyObject *self, PyObject *args)
      if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
          return NULL;
  
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      ret = _PyFloat_FormatAdvancedWriter(
          &writer,
          self,
diff --git a/Objects/longobject.c b/Objects/longobject.c

index cdaea027751b14372356c40c55ff8bc37de50038..2b04804216fca965e3f910a6867c5b42c3e41280 100644 (file)
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4379,7 +4379,7 @@ long__format__(PyObject *self, PyObject *args)
      if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
          return NULL;
  
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      ret = _PyLong_FormatAdvancedWriter(
          &writer,
          self,
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h

index 2f58946ec35000b10114b39b306176a5a9e21224..942916938c8222a28d9df6b1c13be6e667051a56 100644 (file)
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -906,7 +906,6 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
               int recursion_depth, AutoNumber *auto_number)
  {
      _PyUnicodeWriter writer;
-    Py_ssize_t minlen;
  
      /* check the recursion level */
      if (recursion_depth <= 0) {
@@ -915,8 +914,9 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
          return NULL;
      }
  
-    minlen = PyUnicode_GET_LENGTH(input->str) + 100;
-    _PyUnicodeWriter_Init(&writer, minlen);
+    _PyUnicodeWriter_Init(&writer);
+    writer.overallocate = 1;
+    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
  
      if (!do_markup(input, args, kwargs, &writer, recursion_depth,
                     auto_number)) {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 748fcc76654ca8cf52980a595e570a1c12bad357..c4157d8270f16775b1329e3a534d7f485a5da82e 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2665,7 +2665,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
      const char *f;
      _PyUnicodeWriter writer;
  
-    _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = strlen(format) + 100;
+    writer.overallocate = 1;
  
      /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
         Copy it to be able to pass a reference to a subfunction. */
@@ -4117,7 +4119,10 @@ unicode_decode_call_errorhandler_writer(
          goto onError;
      }
  
-    writer->overallocate = 1;
+    if (PyUnicode_READY(repunicode) < 0)
+        goto onError;
+    if (PyUnicode_GET_LENGTH(repunicode) > 1)
+        writer->overallocate = 1;
      if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
          return
  
@@ -4256,9 +4261,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
      }
  
      /* Start off assuming it's all ASCII. Widen later as necessary. */
-    _PyUnicodeWriter_Init(&writer, 0);
-    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-        goto onError;
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = size;
  
      shiftOutStart = 0;
      e = s + size;
@@ -4655,7 +4659,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
          return get_latin1_char((unsigned char)s[0]);
      }
  
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
          goto onError;
  
@@ -4910,7 +4914,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
      le = bo <= 0;
  #endif
  
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
          goto onError;
  
@@ -5149,7 +5153,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
  
      /* Note: size will always be longer than the resulting Unicode
         character count */
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
          goto onError;
  
@@ -5420,11 +5424,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
       and we determined its exact size (common case)
         or it contains \x, \u, ... escape sequences.  then we create a
         legacy wchar string and resize it at the end of this function. */
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      if (len > 0) {
-        if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
-            goto onError;
-        assert(writer.kind == PyUnicode_1BYTE_KIND);
+        writer.min_length = len;
      }
      else {
          /* Escaped strings will always be longer than the resulting
@@ -5432,8 +5434,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
             length after conversion to the true value.
             (but if the error callback returns a long replacement string
             we'll have to allocate more space) */
-        if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-            goto onError;
+        writer.min_length = size;
      }
  
      if (size == 0)
@@ -5461,10 +5462,6 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
          if (s > end)
              c = '\0'; /* Invalid after \ */
  
-        /* The only case in which i == ascii_length is a backslash
-           followed by a newline. */
-        assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
-
          switch (c) {
  
              /* \x escapes */
@@ -5787,9 +5784,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
         Unicode string, so we start with size here and then reduce the
         length after conversion to the true value. (But decoding error
         handler might have to resize the string) */
-    _PyUnicodeWriter_Init(&writer, 1);
-    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-        goto onError;
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = size;
  
      end = s + size;
      while (s < end) {
@@ -5982,12 +5978,14 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
      if (size == 0)
          _Py_RETURN_UNICODE_EMPTY();
  
-    /* XXX overflow detection missing */
-    _PyUnicodeWriter_Init(&writer, 0);
-    if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
+    _PyUnicodeWriter_Init(&writer);
+    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
+        PyErr_NoMemory();
          goto onError;
-    end = s + size;
+    }
+    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
  
+    end = s + size;
      while (s < end) {
          Py_UNICODE uch;
          Py_UCS4 ch;
@@ -6429,9 +6427,9 @@ PyUnicode_DecodeASCII(const char *s,
      if (size == 1 && (unsigned char)s[0] < 128)
          return get_latin1_char((unsigned char)s[0]);
  
-    _PyUnicodeWriter_Init(&writer, 0);
-    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-        goto onError;
+    _PyUnicodeWriter_Init(&writer);
+    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0)
+        return NULL;
  
      e = s + size;
      data = writer.data;
@@ -7280,7 +7278,7 @@ PyUnicode_DecodeCharmap(const char *s,
  
      if (size == 0)
          _Py_RETURN_UNICODE_EMPTY();
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
          goto onError;
  
@@ -7312,7 +7310,7 @@ PyUnicode_DecodeCharmap(const char *s,
                  ch = *s;
                  x = mapdata_ucs1[ch];
                  if (x > maxchar) {
-                    if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
+                    if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1)
                          goto onError;
                      maxchar = writer.maxchar;
                      outdata = (Py_UCS1 *)writer.data;
@@ -12841,21 +12839,27 @@ unicode_endswith(PyObject *self,
  Py_LOCAL_INLINE(void)
  _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
  {
-    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+    if (!writer->readonly)
+        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+    else {
+        /* Copy-on-write mode: set buffer size to 0 so
+         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
+         * next write. */
+        writer->size = 0;
+    }
      writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
      writer->data = PyUnicode_DATA(writer->buffer);
      writer->kind = PyUnicode_KIND(writer->buffer);
  }
  
  void
-_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
  {
      memset(writer, 0, sizeof(*writer));
  #ifdef Py_DEBUG
      writer->kind = 5;    /* invalid kind */
  #endif
-    writer->min_length = Py_MAX(min_length, 100);
-    writer->overallocate = (min_length > 0);
+    writer->min_char = 127;
  }
  
  int
@@ -12873,29 +12877,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
      }
      newlen = writer->pos + length;
  
+    maxchar = MAX_MAXCHAR(maxchar, writer->min_char);
+
      if (writer->buffer == NULL) {
-        if (writer->overallocate) {
+        assert(!writer->readonly);
+        if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
              /* overallocate 25% to limit the number of resize */
-            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
-                newlen += newlen / 4;
-            if (newlen < writer->min_length)
-                newlen = writer->min_length;
+            newlen += newlen / 4;
          }
+        if (newlen < writer->min_length)
+            newlen = writer->min_length;
+
          writer->buffer = PyUnicode_New(newlen, maxchar);
          if (writer->buffer == NULL)
              return -1;
-        _PyUnicodeWriter_Update(writer);
-        return 0;
      }
-
-    if (newlen > writer->size) {
-        if (writer->overallocate) {
+    else if (newlen > writer->size) {
+        if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
              /* overallocate 25% to limit the number of resize */
-            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
-                newlen += newlen / 4;
-            if (newlen < writer->min_length)
-                newlen = writer->min_length;
+            newlen += newlen / 4;
          }
+        if (newlen < writer->min_length)
+            newlen = writer->min_length;
  
          if (maxchar > writer->maxchar || writer->readonly) {
              /* resize + widen */
@@ -12913,7 +12916,6 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                  return -1;
          }
          writer->buffer = newbuffer;
-        _PyUnicodeWriter_Update(writer);
      }
      else if (maxchar > writer->maxchar) {
          assert(!writer->readonly);
@@ -12924,8 +12926,8 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                        writer->buffer, 0, writer->pos);
          Py_DECREF(writer->buffer);
          writer->buffer = newbuffer;
-        _PyUnicodeWriter_Update(writer);
      }
+    _PyUnicodeWriter_Update(writer);
      return 0;
  }
  
@@ -12959,11 +12961,10 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
      maxchar = PyUnicode_MAX_CHAR_VALUE(str);
      if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
          if (writer->buffer == NULL && !writer->overallocate) {
+            writer->readonly = 1;
              Py_INCREF(str);
              writer->buffer = str;
              _PyUnicodeWriter_Update(writer);
-            writer->readonly = 1;
-            writer->size = 0;
              writer->pos += len;
              return 0;
          }
@@ -13080,7 +13081,7 @@ unicode__format__(PyObject* self, PyObject* args)
  
      if (PyUnicode_READY(self) == -1)
          return NULL;
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
      ret = _PyUnicode_FormatAdvancedWriter(&writer,
                                            self, format_spec, 0,
                                            PyUnicode_GET_LENGTH(format_spec));
@@ -14164,7 +14165,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)
      ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
      ctx.fmtpos = 0;
  
-    _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
+    _PyUnicodeWriter_Init(&ctx.writer);
+    ctx.writer.min_length = ctx.fmtcnt + 100;
+    ctx.writer.overallocate = 1;
  
      if (PyTuple_Check(args)) {
          ctx.arglen = PyTuple_Size(args);
author	Victor Stinner <victor.stinner@gmail.com>
	Wed, 17 Apr 2013 21:02:17 +0000 (23:02 +0200)
committer	Victor Stinner <victor.stinner@gmail.com>
	Wed, 17 Apr 2013 21:02:17 +0000 (23:02 +0200)
Include/unicodeobject.h		patch \| blob \| history
Modules/cjkcodecs/multibytecodec.c		patch \| blob \| history
Objects/complexobject.c		patch \| blob \| history
Objects/floatobject.c		patch \| blob \| history
Objects/longobject.c		patch \| blob \| history
Objects/stringlib/unicode_format.h		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history