]> granicus.if.org Git - python/commitdiff
Issue #24870: Optimize the ASCII decoder for error handlers: surrogateescape,
authorVictor Stinner <victor.stinner@gmail.com>
Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)
committerVictor Stinner <victor.stinner@gmail.com>
Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)
ignore and replace. Initial patch written by Naoki Inada.

The decoder is now up to 60 times as fast for these error handlers.

Add also unit tests for the ASCII decoder.

Doc/whatsnew/3.6.rst
Lib/test/test_codecs.py
Objects/unicodeobject.c

index 48ff38a241fca2167f66d4c9153517a14157536a..8a2b5d3021df0720bf44cbbe064e4d8c6b4a75cf 100644 (file)
@@ -106,7 +106,8 @@ operator
 Optimizations
 =============
 
-* None yet.
+* The ASCII decoder is now up to 60 times as fast for error handlers:
+  ``surrogateescape``, ``ignore`` and ``replace``.
 
 
 Build and C API Changes
index a4a6f95ca2da229fb3348938634d8c1202d5a74b..e0e31199ccabf01ff9096d14bebbea626d14bbe0 100644 (file)
@@ -27,6 +27,7 @@ def coding_checker(self, coder):
         self.assertEqual(coder(input), (expect, len(input)))
     return check
 
+
 class Queue(object):
     """
     queue: write bytes at one end, read bytes from the other end
@@ -47,6 +48,7 @@ class Queue(object):
             self._buffer = self._buffer[size:]
             return s
 
+
 class MixInCheckStateHandling:
     def check_state_handling_decode(self, encoding, u, s):
         for i in range(len(s)+1):
@@ -80,6 +82,7 @@ class MixInCheckStateHandling:
             part2 = d.encode(u[i:], True)
             self.assertEqual(s, part1+part2)
 
+
 class ReadTest(MixInCheckStateHandling):
     def check_partial(self, input, partialresults):
         # get a StreamReader for the encoding and feed the bytestring version
@@ -383,6 +386,7 @@ class ReadTest(MixInCheckStateHandling):
             self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                              before + backslashreplace + after)
 
+
 class UTF32Test(ReadTest, unittest.TestCase):
     encoding = "utf-32"
     if sys.byteorder == 'little':
@@ -478,6 +482,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
         self.assertEqual('\U00010000' * 1024,
                          codecs.utf_32_decode(encoded_be)[0])
 
+
 class UTF32LETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-le"
     ill_formed_sequence = b"\x80\xdc\x00\x00"
@@ -523,6 +528,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
         self.assertEqual('\U00010000' * 1024,
                          codecs.utf_32_le_decode(encoded)[0])
 
+
 class UTF32BETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-be"
     ill_formed_sequence = b"\x00\x00\xdc\x80"
@@ -797,6 +803,7 @@ class UTF8Test(ReadTest, unittest.TestCase):
         with self.assertRaises(UnicodeDecodeError):
             b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
 
+
 @unittest.skipUnless(sys.platform == 'win32',
                      'cp65001 is a Windows-only codec')
 class CP65001Test(ReadTest, unittest.TestCase):
@@ -1136,6 +1143,7 @@ class EscapeDecodeTest(unittest.TestCase):
         self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
         self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
 
+
 class RecodingTest(unittest.TestCase):
     def test_recoding(self):
         f = io.BytesIO()
@@ -1255,6 +1263,7 @@ for i in punycode_testcases:
     if len(i)!=2:
         print(repr(i))
 
+
 class PunycodeTest(unittest.TestCase):
     def test_encode(self):
         for uni, puny in punycode_testcases:
@@ -1274,6 +1283,7 @@ class PunycodeTest(unittest.TestCase):
             puny = puny.decode("ascii").encode("ascii")
             self.assertEqual(uni, puny.decode("punycode"))
 
+
 class UnicodeInternalTest(unittest.TestCase):
     @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
     def test_bug1251300(self):
@@ -1528,6 +1538,7 @@ class NameprepTest(unittest.TestCase):
                 except Exception as e:
                     raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
 
+
 class IDNACodecTest(unittest.TestCase):
     def test_builtin_decode(self):
         self.assertEqual(str(b"python.org", "idna"), "python.org")
@@ -1614,6 +1625,7 @@ class IDNACodecTest(unittest.TestCase):
             self.assertRaises(Exception,
                 b"python.org".decode, "idna", errors)
 
+
 class CodecsModuleTest(unittest.TestCase):
 
     def test_decode(self):
@@ -1722,6 +1734,7 @@ class CodecsModuleTest(unittest.TestCase):
             self.assertRaises(UnicodeError,
                 codecs.decode, b'abc', 'undefined', errors)
 
+
 class StreamReaderTest(unittest.TestCase):
 
     def setUp(self):
@@ -1732,6 +1745,7 @@ class StreamReaderTest(unittest.TestCase):
         f = self.reader(self.stream)
         self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
 
+
 class EncodedFileTest(unittest.TestCase):
 
     def test_basic(self):
@@ -1862,6 +1876,7 @@ broken_unicode_with_stateful = [
     "unicode_internal"
 ]
 
+
 class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
     def test_basics(self):
         s = "abc123"  # all codecs should be able to encode these
@@ -2024,6 +2039,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
                 self.check_state_handling_decode(encoding, u, u.encode(encoding))
                 self.check_state_handling_encode(encoding, u, u.encode(encoding))
 
+
 class CharmapTest(unittest.TestCase):
     def test_decode_with_string_map(self):
         self.assertEqual(
@@ -2274,6 +2290,7 @@ class WithStmtTest(unittest.TestCase):
                                        info.streamwriter, 'strict') as srw:
             self.assertEqual(srw.read(), "\xfc")
 
+
 class TypesTest(unittest.TestCase):
     def test_decode_unicode(self):
         # Most decoders don't accept unicode input
@@ -2564,6 +2581,7 @@ else:
     bytes_transform_encodings.append("bz2_codec")
     transform_aliases["bz2_codec"] = ["bz2"]
 
+
 class TransformCodecTest(unittest.TestCase):
 
     def test_basics(self):
@@ -3041,5 +3059,19 @@ class CodePageTest(unittest.TestCase):
         self.assertEqual(decoded, ('abc', 3))
 
 
+class ASCIITest(unittest.TestCase):
+    def test_decode(self):
+        for data, error_handler, expected in (
+            (b'[\x80\xff]', 'ignore', '[]'),
+            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+        ):
+            with self.subTest(data=data, error_handler=error_handler,
+                              expected=expected):
+                self.assertEqual(data.decode('ascii', error_handler),
+                                 expected)
+
+
 if __name__ == "__main__":
     unittest.main()
index 0709789991e9337537045e963bc95ed9e1d29a03..b8840e1e360e4ab3ea0d9343dc76ec6b5601919b 100644 (file)
@@ -6644,6 +6644,28 @@ PyUnicode_AsLatin1String(PyObject *unicode)
 
 /* --- 7-bit ASCII Codec -------------------------------------------------- */
 
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    if (errors == NULL)
+        return _Py_ERROR_OTHER;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_ERROR_SURROGATEESCAPE;
+    if (strcmp(errors, "ignore") == 0)
+        return _Py_ERROR_IGNORE;
+    if (strcmp(errors, "replace") == 0)
+        return _Py_ERROR_REPLACE;
+    return _Py_ERROR_OTHER;
+}
+
 PyObject *
 PyUnicode_DecodeASCII(const char *s,
                       Py_ssize_t size,
@@ -6657,8 +6679,9 @@ PyUnicode_DecodeASCII(const char *s,
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6687,12 +6710,45 @@ PyUnicode_DecodeASCII(const char *s,
             PyUnicode_WRITE(kind, data, writer.pos, c);
             writer.pos++;
             ++s;
+            continue;
         }
-        else {
+
+        /* byte outsize range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we must switch to UCS2 at the first write */
+            if (kind < PyUnicode_2BYTE_KIND) {
+                if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+                                             0xffff) < 0)
+                    return NULL;
+                kind = writer.kind;
+                data = writer.data;
+            }
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                     "ascii", "ordinal not in range(128)",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
                     &writer))
@@ -6701,13 +6757,13 @@ PyUnicode_DecodeASCII(const char *s,
             data = writer.data;
         }
     }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
   onError:
     _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }