Issue #18184: PyUnicode_FromFormat() and PyUnicode_FromFormatV() now raise OverflowError when an argument of %c format is out of range.

author Serhiy Storchaka <storchaka@gmail.com>

Sun, 23 Jun 2013 17:21:16 +0000 (20:21 +0300)

committer Serhiy Storchaka <storchaka@gmail.com>

Sun, 23 Jun 2013 17:21:16 +0000 (20:21 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Sun, 23 Jun 2013 17:21:16 +0000 (20:21 +0300)
committer Serhiy Storchaka <storchaka@gmail.com>
Sun, 23 Jun 2013 17:21:16 +0000 (20:21 +0300)
diff --cc Lib/test/test_unicode.py

index 382b4630934d3b8f005bff9a6a5d60b82288bae0,0c82560ca7aae827336a2313db617ef975c11cd9..518d6d60165341f28798463976e4571b055f86be
--- 1/Lib/test/test_unicode.py
--- 2/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@@ -2044,182 -2022,47 +2044,184 @@@ class UnicodeTest(string_tests.CommonTe
               PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
   
           # test "%c"
- -        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
- -        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')
+ +        check_format('\uabcd',
+ +                     b'%c', c_int(0xabcd))
+ +        check_format('\U0010ffff',
+ +                     b'%c', c_int(0x10ffff))
+         with self.assertRaises(OverflowError):
+             PyUnicode_FromFormat(b'%c', c_int(0x110000))
           # Issue #18183
- -        self.assertEqual(
- -            PyUnicode_FromFormat(b'%c%c', c_int(0x10000), c_int(0x100000)),
- -            '\U00010000\U00100000')
+ +        check_format('\U00010000\U00100000',
+ +                     b'%c%c', c_int(0x10000), c_int(0x100000))
   
           # test "%"
- -        self.assertEqual(PyUnicode_FromFormat(b'%'), '%')
- -        self.assertEqual(PyUnicode_FromFormat(b'%%'), '%')
- -        self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s')
- -        self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]')
- -        self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc')
+ +        check_format('%',
+ +                     b'%')
+ +        check_format('%',
+ +                     b'%%')
+ +        check_format('%s',
+ +                     b'%%s')
+ +        check_format('[%]',
+ +                     b'[%%]')
+ +        check_format('%abc',
+ +                     b'%%%s', b'abc')
+ +
+ +        # truncated string
+ +        check_format('abc',
+ +                     b'%.3s', b'abcdef')
+ +        check_format('abc[\ufffd',
+ +                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+ +        check_format("'\\u20acABC'",
+ +                     b'%A', '\u20acABC')
+ +        check_format("'\\u20",
+ +                     b'%.5A', '\u20acABCDEF')
+ +        check_format("'\u20acABC'",
+ +                     b'%R', '\u20acABC')
+ +        check_format("'\u20acA",
+ +                     b'%.3R', '\u20acABCDEF')
+ +        check_format('\u20acAB',
+ +                     b'%.3S', '\u20acABCDEF')
+ +        check_format('\u20acAB',
+ +                     b'%.3U', '\u20acABCDEF')
+ +        check_format('\u20acAB',
+ +                     b'%.3V', '\u20acABCDEF', None)
+ +        check_format('abc[\ufffd',
+ +                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
+ +
+ +        # following tests comes from #7330
+ +        # test width modifier and precision modifier with %S
+ +        check_format("repr=  abc",
+ +                     b'repr=%5S', 'abc')
+ +        check_format("repr=ab",
+ +                     b'repr=%.2S', 'abc')
+ +        check_format("repr=   ab",
+ +                     b'repr=%5.2S', 'abc')
+ +
+ +        # test width modifier and precision modifier with %R
+ +        check_format("repr=   'abc'",
+ +                     b'repr=%8R', 'abc')
+ +        check_format("repr='ab",
+ +                     b'repr=%.3R', 'abc')
+ +        check_format("repr=  'ab",
+ +                     b'repr=%5.3R', 'abc')
+ +
+ +        # test width modifier and precision modifier with %A
+ +        check_format("repr=   'abc'",
+ +                     b'repr=%8A', 'abc')
+ +        check_format("repr='ab",
+ +                     b'repr=%.3A', 'abc')
+ +        check_format("repr=  'ab",
+ +                     b'repr=%5.3A', 'abc')
+ +
+ +        # test width modifier and precision modifier with %s
+ +        check_format("repr=  abc",
+ +                     b'repr=%5s', b'abc')
+ +        check_format("repr=ab",
+ +                     b'repr=%.2s', b'abc')
+ +        check_format("repr=   ab",
+ +                     b'repr=%5.2s', b'abc')
+ +
+ +        # test width modifier and precision modifier with %U
+ +        check_format("repr=  abc",
+ +                     b'repr=%5U', 'abc')
+ +        check_format("repr=ab",
+ +                     b'repr=%.2U', 'abc')
+ +        check_format("repr=   ab",
+ +                     b'repr=%5.2U', 'abc')
+ +
+ +        # test width modifier and precision modifier with %V
+ +        check_format("repr=  abc",
+ +                     b'repr=%5V', 'abc', b'123')
+ +        check_format("repr=ab",
+ +                     b'repr=%.2V', 'abc', b'123')
+ +        check_format("repr=   ab",
+ +                     b'repr=%5.2V', 'abc', b'123')
+ +        check_format("repr=  123",
+ +                     b'repr=%5V', None, b'123')
+ +        check_format("repr=12",
+ +                     b'repr=%.2V', None, b'123')
+ +        check_format("repr=   12",
+ +                     b'repr=%5.2V', None, b'123')
   
           # test integer formats (%i, %d, %u)
- -        self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010')
- -        self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010')
- -        self.assertEqual(PyUnicode_FromFormat(b'%i', c_int(-123)), '-123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%li', c_long(-123)), '-123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%lli', c_longlong(-123)), '-123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%zi', c_ssize_t(-123)), '-123')
- -
- -        self.assertEqual(PyUnicode_FromFormat(b'%d', c_int(-123)), '-123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%ld', c_long(-123)), '-123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(-123)), '-123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%zd', c_ssize_t(-123)), '-123')
- -
- -        self.assertEqual(PyUnicode_FromFormat(b'%u', c_uint(123)), '123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%lu', c_ulong(123)), '123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123')
- -        self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123')
+ +        check_format('010',
+ +                     b'%03i', c_int(10))
+ +        check_format('0010',
+ +                     b'%0.4i', c_int(10))
+ +        check_format('-123',
+ +                     b'%i', c_int(-123))
+ +        check_format('-123',
+ +                     b'%li', c_long(-123))
+ +        check_format('-123',
+ +                     b'%lli', c_longlong(-123))
+ +        check_format('-123',
+ +                     b'%zi', c_ssize_t(-123))
+ +
+ +        check_format('-123',
+ +                     b'%d', c_int(-123))
+ +        check_format('-123',
+ +                     b'%ld', c_long(-123))
+ +        check_format('-123',
+ +                     b'%lld', c_longlong(-123))
+ +        check_format('-123',
+ +                     b'%zd', c_ssize_t(-123))
+ +
+ +        check_format('123',
+ +                     b'%u', c_uint(123))
+ +        check_format('123',
+ +                     b'%lu', c_ulong(123))
+ +        check_format('123',
+ +                     b'%llu', c_ulonglong(123))
+ +        check_format('123',
+ +                     b'%zu', c_size_t(123))
+ +
+ +        # test long output
+ +        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
+ +        max_longlong = -min_longlong - 1
+ +        check_format(str(min_longlong),
+ +                     b'%lld', c_longlong(min_longlong))
+ +        check_format(str(max_longlong),
+ +                     b'%lld', c_longlong(max_longlong))
+ +        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
+ +        check_format(str(max_ulonglong),
+ +                     b'%llu', c_ulonglong(max_ulonglong))
+ +        PyUnicode_FromFormat(b'%p', c_void_p(-1))
+ +
+ +        # test padding (width and/or precision)
+ +        check_format('123'.rjust(10, '0'),
+ +                     b'%010i', c_int(123))
+ +        check_format('123'.rjust(100),
+ +                     b'%100i', c_int(123))
+ +        check_format('123'.rjust(100, '0'),
+ +                     b'%.100i', c_int(123))
+ +        check_format('123'.rjust(80, '0').rjust(100),
+ +                     b'%100.80i', c_int(123))
+ +
+ +        check_format('123'.rjust(10, '0'),
+ +                     b'%010u', c_uint(123))
+ +        check_format('123'.rjust(100),
+ +                     b'%100u', c_uint(123))
+ +        check_format('123'.rjust(100, '0'),
+ +                     b'%.100u', c_uint(123))
+ +        check_format('123'.rjust(80, '0').rjust(100),
+ +                     b'%100.80u', c_uint(123))
+ +
+ +        check_format('123'.rjust(10, '0'),
+ +                     b'%010x', c_int(0x123))
+ +        check_format('123'.rjust(100),
+ +                     b'%100x', c_int(0x123))
+ +        check_format('123'.rjust(100, '0'),
+ +                     b'%.100x', c_int(0x123))
+ +        check_format('123'.rjust(80, '0').rjust(100),
+ +                     b'%100.80x', c_int(0x123))
   
           # test %A
- -        text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
- -        self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")
+ +        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
+ +                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
   
           # test %V
- -        text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz')
- -        self.assertEqual(text, 'repr=abc')
+ +        check_format('repr=abc',
+ +                     b'repr=%V', 'abc', b'xyz')
   
           # Test string decode from parameter of %s using utf-8.
           # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
diff --cc Misc/NEWS
Simple merge
diff --cc Objects/unicodeobject.c

index c40e9ece5acb600cea1acbe447d5eea7e3c2670a,2e40c273a443d7d14cab0d2a121c12561dee8245..5659c71ce8efd2efe629aaf8dba97ac6c4f34caa
--- 1/Objects/unicodeobject.c
--- 2/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -2487,289 -2368,499 +2487,289 @@@ unicode_fromformat_arg(_PyUnicodeWrite
           size_tflag = 1;
           ++f;
       }
- -    if (p_longflag != NULL)
- -        *p_longflag = longflag;
- -    if (p_longlongflag != NULL)
- -        *p_longlongflag = longlongflag;
- -    if (p_size_tflag != NULL)
- -        *p_size_tflag = size_tflag;
- -    return f;
- -}
   
- -/* maximum number of characters required for output of %ld.  21 characters
- -   allows for 64-bit integers (in decimal) and an optional sign. */
- -#define MAX_LONG_CHARS 21
- -/* maximum number of characters required for output of %lld.
- -   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
- -   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
- -#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
+ +    if (f[1] == '\0')
+ +        writer->overallocate = 0;
   
- -PyObject *
- -PyUnicode_FromFormatV(const char *format, va_list vargs)
- -{
- -    va_list count;
- -    Py_ssize_t callcount = 0;
- -    PyObject **callresults = NULL;
- -    PyObject **callresult = NULL;
- -    Py_ssize_t n = 0;
- -    int width = 0;
- -    int precision = 0;
- -    int zeropad;
- -    const char* f;
- -    PyObject *string;
- -    /* used by sprintf */
- -    char fmt[61]; /* should be enough for %0width.precisionlld */
- -    Py_UCS4 maxchar = 127; /* result is ASCII by default */
- -    Py_UCS4 argmaxchar;
- -    Py_ssize_t numbersize = 0;
- -    char *numberresults = NULL;
- -    char *numberresult = NULL;
- -    Py_ssize_t i;
- -    int kind;
- -    void *data;
+ +    switch (*f) {
+ +    case 'c':
+ +    {
+ +        int ordinal = va_arg(*vargs, int);
+ +        if (ordinal < 0 || ordinal > MAX_UNICODE) {
-             PyErr_SetString(PyExc_ValueError,
++            PyErr_SetString(PyExc_OverflowError,
+ +                            "character argument not in range(0x110000)");
+ +            return NULL;
+ +        }
+ +        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
+ +            return NULL;
+ +        break;
+ +    }
   
- -    Py_VA_COPY(count, vargs);
- -    /* step 1: count the number of %S/%R/%A/%s format specifications
- -     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
- -     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
- -     * result in an array)
- -     * also estimate a upper bound for all the number formats in the string,
- -     * numbers will be formatted in step 3 and be kept in a '\0'-separated
- -     * buffer before putting everything together. */
- -    for (f = format; *f; f++) {
- -        if (*f == '%') {
- -            int longlongflag;
- -            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
- -            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
- -            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
- -                ++callcount;
+ +    case 'i':
+ +    case 'd':
+ +    case 'u':
+ +    case 'x':
+ +    {
+ +        /* used by sprintf */
+ +        char fmt[10]; /* should be enough for "%0lld\0" */
+ +        char buffer[MAX_LONG_LONG_CHARS];
+ +        Py_ssize_t arglen;
   
- -            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
+ +        if (*f == 'u') {
+ +            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
+ +
+ +            if (longflag)
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, unsigned long));
   #ifdef HAVE_LONG_LONG
- -                if (longlongflag) {
- -                    if (width < MAX_LONG_LONG_CHARS)
- -                        width = MAX_LONG_LONG_CHARS;
- -                }
- -                else
+ +            else if (longlongflag)
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, unsigned PY_LONG_LONG));
   #endif
- -                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
- -                       including sign.  Decimal takes the most space.  This
- -                       isn't enough for octal.  If a width is specified we
- -                       need more (which we allocate later). */
- -                    if (width < MAX_LONG_CHARS)
- -                        width = MAX_LONG_CHARS;
- -
- -                /* account for the size + '\0' to separate numbers
- -                   inside of the numberresults buffer */
- -                numbersize += (width + 1);
- -            }
+ +            else if (size_tflag)
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, size_t));
+ +            else
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, unsigned int));
+ +        }
+ +        else if (*f == 'x') {
+ +            makefmt(fmt, 0, 0, 0, 'x');
+ +            len = sprintf(buffer, fmt, va_arg(*vargs, int));
+ +        }
+ +        else {
+ +            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
+ +
+ +            if (longflag)
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, long));
+ +#ifdef HAVE_LONG_LONG
+ +            else if (longlongflag)
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, PY_LONG_LONG));
+ +#endif
+ +            else if (size_tflag)
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, Py_ssize_t));
+ +            else
+ +                len = sprintf(buffer, fmt,
+ +                        va_arg(*vargs, int));
           }
- -        else if ((unsigned char)*f > 127) {
- -            PyErr_Format(PyExc_ValueError,
- -                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
- -                "string, got a non-ASCII byte: 0x%02x",
- -                (unsigned char)*f);
+ +        assert(len >= 0);
+ +
+ +        if (precision < len)
+ +            precision = len;
+ +
+ +        arglen = Py_MAX(precision, width);
+ +        assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
+ +        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
               return NULL;
+ +
+ +        if (width > precision) {
+ +            Py_UCS4 fillchar;
+ +            fill = width - precision;
+ +            fillchar = zeropad?'0':' ';
+ +            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
+ +                return NULL;
+ +            writer->pos += fill;
           }
+ +        if (precision > len) {
+ +            fill = precision - len;
+ +            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
+ +                return NULL;
+ +            writer->pos += fill;
+ +        }
+ +
+ +        unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
+ +        writer->pos += len;
+ +        break;
       }
- -    /* step 2: allocate memory for the results of
- -     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
- -    if (callcount) {
- -        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
- -        if (!callresults) {
- -            PyErr_NoMemory();
+ +
+ +    case 'p':
+ +    {
+ +        char number[MAX_LONG_LONG_CHARS];
+ +
+ +        len = sprintf(number, "%p", va_arg(*vargs, void*));
+ +        assert(len >= 0);
+ +
+ +        /* %p is ill-defined:  ensure leading 0x. */
+ +        if (number[1] == 'X')
+ +            number[1] = 'x';
+ +        else if (number[1] != 'x') {
+ +            memmove(number + 2, number,
+ +                    strlen(number) + 1);
+ +            number[0] = '0';
+ +            number[1] = 'x';
+ +            len += 2;
+ +        }
+ +
+ +        assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
+ +        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
+ +            return NULL;
+ +        unicode_write_cstr(writer->buffer, writer->pos, number, len);
+ +        writer->pos += len;
+ +        break;
+ +    }
+ +
+ +    case 's':
+ +    {
+ +        /* UTF-8 */
+ +        const char *s = va_arg(*vargs, const char*);
+ +        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
+ +            return NULL;
+ +        break;
+ +    }
+ +
+ +    case 'U':
+ +    {
+ +        PyObject *obj = va_arg(*vargs, PyObject *);
+ +        assert(obj && _PyUnicode_CHECK(obj));
+ +
+ +        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
               return NULL;
+ +        break;
+ +    }
+ +
+ +    case 'V':
+ +    {
+ +        PyObject *obj = va_arg(*vargs, PyObject *);
+ +        const char *str = va_arg(*vargs, const char *);
+ +        if (obj) {
+ +            assert(_PyUnicode_CHECK(obj));
+ +            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
+ +                return NULL;
+ +        }
+ +        else {
+ +            assert(str != NULL);
+ +            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
+ +                return NULL;
           }
- -        callresult = callresults;
+ +        break;
       }
- -    /* step 2.5: allocate memory for the results of formating numbers */
- -    if (numbersize) {
- -        numberresults = PyObject_Malloc(numbersize);
- -        if (!numberresults) {
- -            PyErr_NoMemory();
- -            goto fail;
+ +
+ +    case 'S':
+ +    {
+ +        PyObject *obj = va_arg(*vargs, PyObject *);
+ +        PyObject *str;
+ +        assert(obj);
+ +        str = PyObject_Str(obj);
+ +        if (!str)
+ +            return NULL;
+ +        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
+ +            Py_DECREF(str);
+ +            return NULL;
+ +        }
+ +        Py_DECREF(str);
+ +        break;
+ +    }
+ +
+ +    case 'R':
+ +    {
+ +        PyObject *obj = va_arg(*vargs, PyObject *);
+ +        PyObject *repr;
+ +        assert(obj);
+ +        repr = PyObject_Repr(obj);
+ +        if (!repr)
+ +            return NULL;
+ +        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
+ +            Py_DECREF(repr);
+ +            return NULL;
+ +        }
+ +        Py_DECREF(repr);
+ +        break;
+ +    }
+ +
+ +    case 'A':
+ +    {
+ +        PyObject *obj = va_arg(*vargs, PyObject *);
+ +        PyObject *ascii;
+ +        assert(obj);
+ +        ascii = PyObject_ASCII(obj);
+ +        if (!ascii)
+ +            return NULL;
+ +        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
+ +            Py_DECREF(ascii);
+ +            return NULL;
           }
- -        numberresult = numberresults;
+ +        Py_DECREF(ascii);
+ +        break;
+ +    }
+ +
+ +    case '%':
+ +        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
+ +            return NULL;
+ +        break;
+ +
+ +    default:
+ +        /* if we stumble upon an unknown formatting code, copy the rest
+ +           of the format string to the output string. (we cannot just
+ +           skip the code, since there's no way to know what's in the
+ +           argument list) */
+ +        len = strlen(p);
+ +        if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
+ +            return NULL;
+ +        f = p+len;
+ +        return f;
       }
   
- -    /* step 3: format numbers and figure out how large a buffer we need */
- -    for (f = format; *f; f++) {
+ +    f++;
+ +    return f;
+ +}
+ +
+ +PyObject *
+ +PyUnicode_FromFormatV(const char *format, va_list vargs)
+ +{
+ +    va_list vargs2;
+ +    const char *f;
+ +    _PyUnicodeWriter writer;
+ +
+ +    _PyUnicodeWriter_Init(&writer);
+ +    writer.min_length = strlen(format) + 100;
+ +    writer.overallocate = 1;
+ +
+ +    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
+ +       Copy it to be able to pass a reference to a subfunction. */
+ +    Py_VA_COPY(vargs2, vargs);
+ +
+ +    for (f = format; *f; ) {
           if (*f == '%') {
- -            const char* p;
- -            int longflag;
- -            int longlongflag;
- -            int size_tflag;
- -            int numprinted;
+ +            f = unicode_fromformat_arg(&writer, f, &vargs2);
+ +            if (f == NULL)
+ +                goto fail;
+ +        }
+ +        else {
+ +            const char *p;
+ +            Py_ssize_t len;
   
               p = f;
- -            zeropad = (f[1] == '0');
- -            f = parse_format_flags(f, &width, &precision,
- -                                   &longflag, &longlongflag, &size_tflag);
- -            switch (*f) {
- -            case 'c':
+ +            do
               {
- -                int ordinal = va_arg(count, int);
- -                if (ordinal < 0 || ordinal > MAX_UNICODE) {
- -                    PyErr_SetString(PyExc_OverflowError,
- -                                    "%c arg not in range(0x110000)");
- -                    goto fail;
+ +                if ((unsigned char)*p > 127) {
+ +                    PyErr_Format(PyExc_ValueError,
+ +                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
+ +                        "string, got a non-ASCII byte: 0x%02x",
+ +                        (unsigned char)*p);
+ +                    return NULL;
                   }
- -                maxchar = Py_MAX(maxchar, (Py_UCS4)ordinal);
- -                n++;
- -                break;
+ +                p++;
               }
- -            case '%':
- -                n++;
- -                break;
- -            case 'i':
- -            case 'd':
- -                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
- -                        width, precision, *f);
- -                if (longflag)
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, long));
- -#ifdef HAVE_LONG_LONG
- -                else if (longlongflag)
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, PY_LONG_LONG));
- -#endif
- -                else if (size_tflag)
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, Py_ssize_t));
- -                else
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, int));
- -                n += numprinted;
- -                /* advance by +1 to skip over the '\0' */
- -                numberresult += (numprinted + 1);
- -                assert(*(numberresult - 1) == '\0');
- -                assert(*(numberresult - 2) != '\0');
- -                assert(numprinted >= 0);
- -                assert(numberresult <= numberresults + numbersize);
- -                break;
- -            case 'u':
- -                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
- -                        width, precision, 'u');
- -                if (longflag)
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, unsigned long));
- -#ifdef HAVE_LONG_LONG
- -                else if (longlongflag)
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, unsigned PY_LONG_LONG));
- -#endif
- -                else if (size_tflag)
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, size_t));
- -                else
- -                    numprinted = sprintf(numberresult, fmt,
- -                                         va_arg(count, unsigned int));
- -                n += numprinted;
- -                numberresult += (numprinted + 1);
- -                assert(*(numberresult - 1) == '\0');
- -                assert(*(numberresult - 2) != '\0');
- -                assert(numprinted >= 0);
- -                assert(numberresult <= numberresults + numbersize);
- -                break;
- -            case 'x':
- -                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
- -                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
- -                n += numprinted;
- -                numberresult += (numprinted + 1);
- -                assert(*(numberresult - 1) == '\0');
- -                assert(*(numberresult - 2) != '\0');
- -                assert(numprinted >= 0);
- -                assert(numberresult <= numberresults + numbersize);
- -                break;
- -            case 'p':
- -                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
- -                /* %p is ill-defined:  ensure leading 0x. */
- -                if (numberresult[1] == 'X')
- -                    numberresult[1] = 'x';
- -                else if (numberresult[1] != 'x') {
- -                    memmove(numberresult + 2, numberresult,
- -                            strlen(numberresult) + 1);
- -                    numberresult[0] = '0';
- -                    numberresult[1] = 'x';
- -                    numprinted += 2;
- -                }
- -                n += numprinted;
- -                numberresult += (numprinted + 1);
- -                assert(*(numberresult - 1) == '\0');
- -                assert(*(numberresult - 2) != '\0');
- -                assert(numprinted >= 0);
- -                assert(numberresult <= numberresults + numbersize);
- -                break;
- -            case 's':
- -            {
- -                /* UTF-8 */
- -                const char *s = va_arg(count, const char*);
- -                PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
- -                if (!str)
- -                    goto fail;
- -                /* since PyUnicode_DecodeUTF8 returns already flexible
- -                   unicode objects, there is no need to call ready on them */
- -                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
- -                maxchar = Py_MAX(maxchar, argmaxchar);
- -                n += PyUnicode_GET_LENGTH(str);
- -                /* Remember the str and switch to the next slot */
- -                *callresult++ = str;
- -                break;
- -            }
- -            case 'U':
- -            {
- -                PyObject *obj = va_arg(count, PyObject *);
- -                assert(obj && _PyUnicode_CHECK(obj));
- -                if (PyUnicode_READY(obj) == -1)
- -                    goto fail;
- -                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
- -                maxchar = Py_MAX(maxchar, argmaxchar);
- -                n += PyUnicode_GET_LENGTH(obj);
- -                break;
- -            }
- -            case 'V':
- -            {
- -                PyObject *obj = va_arg(count, PyObject *);
- -                const char *str = va_arg(count, const char *);
- -                PyObject *str_obj;
- -                assert(obj || str);
- -                assert(!obj || _PyUnicode_CHECK(obj));
- -                if (obj) {
- -                    if (PyUnicode_READY(obj) == -1)
- -                        goto fail;
- -                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
- -                    maxchar = Py_MAX(maxchar, argmaxchar);
- -                    n += PyUnicode_GET_LENGTH(obj);
- -                    *callresult++ = NULL;
- -                }
- -                else {
- -                    str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
- -                    if (!str_obj)
- -                        goto fail;
- -                    if (PyUnicode_READY(str_obj) == -1) {
- -                        Py_DECREF(str_obj);
- -                        goto fail;
- -                    }
- -                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
- -                    maxchar = Py_MAX(maxchar, argmaxchar);
- -                    n += PyUnicode_GET_LENGTH(str_obj);
- -                    *callresult++ = str_obj;
- -                }
- -                break;
- -            }
- -            case 'S':
- -            {
- -                PyObject *obj = va_arg(count, PyObject *);
- -                PyObject *str;
- -                assert(obj);
- -                str = PyObject_Str(obj);
- -                if (!str)
- -                    goto fail;
- -                if (PyUnicode_READY(str) == -1) {
- -                    Py_DECREF(str);
- -                    goto fail;
- -                }
- -                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
- -                maxchar = Py_MAX(maxchar, argmaxchar);
- -                n += PyUnicode_GET_LENGTH(str);
- -                /* Remember the str and switch to the next slot */
- -                *callresult++ = str;
- -                break;
- -            }
- -            case 'R':
- -            {
- -                PyObject *obj = va_arg(count, PyObject *);
- -                PyObject *repr;
- -                assert(obj);
- -                repr = PyObject_Repr(obj);
- -                if (!repr)
- -                    goto fail;
- -                if (PyUnicode_READY(repr) == -1) {
- -                    Py_DECREF(repr);
- -                    goto fail;
- -                }
- -                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
- -                maxchar = Py_MAX(maxchar, argmaxchar);
- -                n += PyUnicode_GET_LENGTH(repr);
- -                /* Remember the repr and switch to the next slot */
- -                *callresult++ = repr;
- -                break;
- -            }
- -            case 'A':
- -            {
- -                PyObject *obj = va_arg(count, PyObject *);
- -                PyObject *ascii;
- -                assert(obj);
- -                ascii = PyObject_ASCII(obj);
- -                if (!ascii)
- -                    goto fail;
- -                if (PyUnicode_READY(ascii) == -1) {
- -                    Py_DECREF(ascii);
- -                    goto fail;
- -                }
- -                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
- -                maxchar = Py_MAX(maxchar, argmaxchar);
- -                n += PyUnicode_GET_LENGTH(ascii);
- -                /* Remember the repr and switch to the next slot */
- -                *callresult++ = ascii;
- -                break;
- -            }
- -            default:
- -                /* if we stumble upon an unknown
- -                   formatting code, copy the rest of
- -                   the format string to the output
- -                   string. (we cannot just skip the
- -                   code, since there's no way to know
- -                   what's in the argument list) */
- -                n += strlen(p);
- -                goto expand;
- -            }
- -        } else
- -            n++;
- -    }
- -  expand:
- -    /* step 4: fill the buffer */
- -    /* Since we've analyzed how much space we need,
- -       we don't have to resize the string.
- -       There can be no errors beyond this point. */
- -    string = PyUnicode_New(n, maxchar);
- -    if (!string)
- -        goto fail;
- -    kind = PyUnicode_KIND(string);
- -    data = PyUnicode_DATA(string);
- -    callresult = callresults;
- -    numberresult = numberresults;
+ +            while (*p != '\0' && *p != '%');
+ +            len = p - f;
   
- -    for (i = 0, f = format; *f; f++) {
- -        if (*f == '%') {
- -            const char* p;
+ +            if (*p == '\0')
+ +                writer.overallocate = 0;
+ +            if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
+ +                goto fail;
+ +            unicode_write_cstr(writer.buffer, writer.pos, f, len);
+ +            writer.pos += len;
   
- -            p = f;
- -            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
- -            /* checking for == because the last argument could be a empty
- -               string, which causes i to point to end, the assert at the end of
- -               the loop */
- -            assert(i <= PyUnicode_GET_LENGTH(string));
- -
- -            switch (*f) {
- -            case 'c':
- -            {
- -                const int ordinal = va_arg(vargs, int);
- -                PyUnicode_WRITE(kind, data, i++, ordinal);
- -                break;
- -            }
- -            case 'i':
- -            case 'd':
- -            case 'u':
- -            case 'x':
- -            case 'p':
- -            {
- -                Py_ssize_t len;
- -                /* unused, since we already have the result */
- -                if (*f == 'p')
- -                    (void) va_arg(vargs, void *);
- -                else
- -                    (void) va_arg(vargs, int);
- -                /* extract the result from numberresults and append. */
- -                len = strlen(numberresult);
- -                unicode_write_cstr(string, i, numberresult, len);
- -                /* skip over the separating '\0' */
- -                i += len;
- -                numberresult += len;
- -                assert(*numberresult == '\0');
- -                numberresult++;
- -                assert(numberresult <= numberresults + numbersize);
- -                break;
- -            }
- -            case 's':
- -            {
- -                /* unused, since we already have the result */
- -                Py_ssize_t size;
- -                (void) va_arg(vargs, char *);
- -                size = PyUnicode_GET_LENGTH(*callresult);
- -                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
- -                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
- -                i += size;
- -                /* We're done with the unicode()/repr() => forget it */
- -                Py_DECREF(*callresult);
- -                /* switch to next unicode()/repr() result */
- -                ++callresult;
- -                break;
- -            }
- -            case 'U':
- -            {
- -                PyObject *obj = va_arg(vargs, PyObject *);
- -                Py_ssize_t size;
- -                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
- -                size = PyUnicode_GET_LENGTH(obj);
- -                _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
- -                i += size;
- -                break;
- -            }
- -            case 'V':
- -            {
- -                Py_ssize_t size;
- -                PyObject *obj = va_arg(vargs, PyObject *);
- -                va_arg(vargs, const char *);
- -                if (obj) {
- -                    size = PyUnicode_GET_LENGTH(obj);
- -                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
- -                    _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
- -                    i += size;
- -                } else {
- -                    size = PyUnicode_GET_LENGTH(*callresult);
- -                    assert(PyUnicode_KIND(*callresult) <=
- -                           PyUnicode_KIND(string));
- -                    _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
- -                    i += size;
- -                    Py_DECREF(*callresult);
- -                }
- -                ++callresult;
- -                break;
- -            }
- -            case 'S':
- -            case 'R':
- -            case 'A':
- -            {
- -                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
- -                /* unused, since we already have the result */
- -                (void) va_arg(vargs, PyObject *);
- -                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
- -                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0,  size);
- -                i += size;
- -                /* We're done with the unicode()/repr() => forget it */
- -                Py_DECREF(*callresult);
- -                /* switch to next unicode()/repr() result */
- -                ++callresult;
- -                break;
- -            }
- -            case '%':
- -                PyUnicode_WRITE(kind, data, i++, '%');
- -                break;
- -            default:
- -            {
- -                Py_ssize_t len = strlen(p);
- -                unicode_write_cstr(string, i, p, len);
- -                i += len;
- -                assert(i == PyUnicode_GET_LENGTH(string));
- -                goto end;
- -            }
- -            }
- -        }
- -        else {
- -            assert(i < PyUnicode_GET_LENGTH(string));
- -            PyUnicode_WRITE(kind, data, i++, *f);
+ +            f = p;
           }
       }
- -    assert(i == PyUnicode_GET_LENGTH(string));
+ +    return _PyUnicodeWriter_Finish(&writer);
   
- -  end:
- -    if (callresults)
- -        PyObject_Free(callresults);
- -    if (numberresults)
- -        PyObject_Free(numberresults);
- -    return unicode_result(string);
     fail:
- -    if (callresults) {
- -        PyObject **callresult2 = callresults;
- -        while (callresult2 < callresult) {
- -            Py_XDECREF(*callresult2);
- -            ++callresult2;
- -        }
- -        PyObject_Free(callresults);
- -    }
- -    if (numberresults)
- -        PyObject_Free(numberresults);
+ +    _PyUnicodeWriter_Dealloc(&writer);
       return NULL;
   }
author	Serhiy Storchaka <storchaka@gmail.com>
	Sun, 23 Jun 2013 17:21:16 +0000 (20:21 +0300)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Sun, 23 Jun 2013 17:21:16 +0000 (20:21 +0300)
		1	2
Lib/test/test_unicode.py	patch \|	diff1 \|	diff2 \|	blob \| history
Misc/NEWS	patch \|	diff1 \|	diff2 \|	blob \| history
Objects/unicodeobject.c	patch \|	diff1 \|	diff2 \|	blob \| history