Issue #24870: Optimize the ASCII decoder for error handlers: surrogateescape, ignore and replace.

author Victor Stinner <victor.stinner@gmail.com>

Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)

committer Victor Stinner <victor.stinner@gmail.com>

Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)
author Victor Stinner <victor.stinner@gmail.com>
Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)
committer Victor Stinner <victor.stinner@gmail.com>
Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst

index 48ff38a241fca2167f66d4c9153517a14157536a..8a2b5d3021df0720bf44cbbe064e4d8c6b4a75cf 100644 (file)
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -106,7 +106,8 @@ operator
  Optimizations
  =============
  
-* None yet.
+* The ASCII decoder is now up to 60 times as fast for error handlers:
+  ``surrogateescape``, ``ignore`` and ``replace``.
  
  
  Build and C API Changes
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index a4a6f95ca2da229fb3348938634d8c1202d5a74b..e0e31199ccabf01ff9096d14bebbea626d14bbe0 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -27,6 +27,7 @@ def coding_checker(self, coder):
          self.assertEqual(coder(input), (expect, len(input)))
      return check
  
+
  class Queue(object):
      """
      queue: write bytes at one end, read bytes from the other end
@@ -47,6 +48,7 @@ class Queue(object):
              self._buffer = self._buffer[size:]
              return s
  
+
  class MixInCheckStateHandling:
      def check_state_handling_decode(self, encoding, u, s):
          for i in range(len(s)+1):
@@ -80,6 +82,7 @@ class MixInCheckStateHandling:
              part2 = d.encode(u[i:], True)
              self.assertEqual(s, part1+part2)
  
+
  class ReadTest(MixInCheckStateHandling):
      def check_partial(self, input, partialresults):
          # get a StreamReader for the encoding and feed the bytestring version
@@ -383,6 +386,7 @@ class ReadTest(MixInCheckStateHandling):
              self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                               before + backslashreplace + after)
  
+
  class UTF32Test(ReadTest, unittest.TestCase):
      encoding = "utf-32"
      if sys.byteorder == 'little':
@@ -478,6 +482,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
          self.assertEqual('\U00010000' * 1024,
                           codecs.utf_32_decode(encoded_be)[0])
  
+
  class UTF32LETest(ReadTest, unittest.TestCase):
      encoding = "utf-32-le"
      ill_formed_sequence = b"\x80\xdc\x00\x00"
@@ -523,6 +528,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
          self.assertEqual('\U00010000' * 1024,
                           codecs.utf_32_le_decode(encoded)[0])
  
+
  class UTF32BETest(ReadTest, unittest.TestCase):
      encoding = "utf-32-be"
      ill_formed_sequence = b"\x00\x00\xdc\x80"
@@ -797,6 +803,7 @@ class UTF8Test(ReadTest, unittest.TestCase):
          with self.assertRaises(UnicodeDecodeError):
              b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
  
+
  @unittest.skipUnless(sys.platform == 'win32',
                       'cp65001 is a Windows-only codec')
  class CP65001Test(ReadTest, unittest.TestCase):
@@ -1136,6 +1143,7 @@ class EscapeDecodeTest(unittest.TestCase):
          self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
          self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
  
+
  class RecodingTest(unittest.TestCase):
      def test_recoding(self):
          f = io.BytesIO()
@@ -1255,6 +1263,7 @@ for i in punycode_testcases:
      if len(i)!=2:
          print(repr(i))
  
+
  class PunycodeTest(unittest.TestCase):
      def test_encode(self):
          for uni, puny in punycode_testcases:
@@ -1274,6 +1283,7 @@ class PunycodeTest(unittest.TestCase):
              puny = puny.decode("ascii").encode("ascii")
              self.assertEqual(uni, puny.decode("punycode"))
  
+
  class UnicodeInternalTest(unittest.TestCase):
      @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
      def test_bug1251300(self):
@@ -1528,6 +1538,7 @@ class NameprepTest(unittest.TestCase):
                  except Exception as e:
                      raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
  
+
  class IDNACodecTest(unittest.TestCase):
      def test_builtin_decode(self):
          self.assertEqual(str(b"python.org", "idna"), "python.org")
@@ -1614,6 +1625,7 @@ class IDNACodecTest(unittest.TestCase):
              self.assertRaises(Exception,
                  b"python.org".decode, "idna", errors)
  
+
  class CodecsModuleTest(unittest.TestCase):
  
      def test_decode(self):
@@ -1722,6 +1734,7 @@ class CodecsModuleTest(unittest.TestCase):
              self.assertRaises(UnicodeError,
                  codecs.decode, b'abc', 'undefined', errors)
  
+
  class StreamReaderTest(unittest.TestCase):
  
      def setUp(self):
@@ -1732,6 +1745,7 @@ class StreamReaderTest(unittest.TestCase):
          f = self.reader(self.stream)
          self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
  
+
  class EncodedFileTest(unittest.TestCase):
  
      def test_basic(self):
@@ -1862,6 +1876,7 @@ broken_unicode_with_stateful = [
      "unicode_internal"
  ]
  
+
  class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
      def test_basics(self):
          s = "abc123"  # all codecs should be able to encode these
@@ -2024,6 +2039,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
                  self.check_state_handling_decode(encoding, u, u.encode(encoding))
                  self.check_state_handling_encode(encoding, u, u.encode(encoding))
  
+
  class CharmapTest(unittest.TestCase):
      def test_decode_with_string_map(self):
          self.assertEqual(
@@ -2274,6 +2290,7 @@ class WithStmtTest(unittest.TestCase):
                                         info.streamwriter, 'strict') as srw:
              self.assertEqual(srw.read(), "\xfc")
  
+
  class TypesTest(unittest.TestCase):
      def test_decode_unicode(self):
          # Most decoders don't accept unicode input
@@ -2564,6 +2581,7 @@ else:
      bytes_transform_encodings.append("bz2_codec")
      transform_aliases["bz2_codec"] = ["bz2"]
  
+
  class TransformCodecTest(unittest.TestCase):
  
      def test_basics(self):
@@ -3041,5 +3059,19 @@ class CodePageTest(unittest.TestCase):
          self.assertEqual(decoded, ('abc', 3))
  
  
+class ASCIITest(unittest.TestCase):  # ASCII decoding of bytes >= 0x80
+    def test_decode(self):  # one sub-test per error handler name
+        for data, error_handler, expected in (
+            (b'[\x80\xff]', 'ignore', '[]'),  # bad bytes dropped
+            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),  # U+FFFD per bad byte
+            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),  # 0xDC00 + byte
+            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),  # literal \xNN text
+        ):
+            with self.subTest(data=data, error_handler=error_handler,
+                              expected=expected):
+                self.assertEqual(data.decode('ascii', error_handler),
+                                 expected)
+
+
  if __name__ == "__main__":
      unittest.main()
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 0709789991e9337537045e963bc95ed9e1d29a03..b8840e1e360e4ab3ea0d9343dc76ec6b5601919b 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6644,6 +6644,28 @@ PyUnicode_AsLatin1String(PyObject *unicode)
  
  /* --- 7-bit ASCII Codec -------------------------------------------------- */
  
+typedef enum {  /* error handler kinds distinguished by name */
+    _Py_ERROR_UNKNOWN=0,  /* handler name not examined yet */
+    _Py_ERROR_SURROGATEESCAPE,  /* "surrogateescape" */
+    _Py_ERROR_REPLACE,  /* "replace" */
+    _Py_ERROR_IGNORE,  /* "ignore" */
+    _Py_ERROR_OTHER  /* NULL or any other handler name */
+} _Py_error_handler;
+
+static _Py_error_handler  /* classify an error handler name */
+get_error_handler(const char *errors)  /* errors: handler name, may be NULL */
+{
+    if (errors == NULL)  /* no name supplied */
+        return _Py_ERROR_OTHER;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_ERROR_SURROGATEESCAPE;
+    if (strcmp(errors, "ignore") == 0)
+        return _Py_ERROR_IGNORE;
+    if (strcmp(errors, "replace") == 0)
+        return _Py_ERROR_REPLACE;
+    return _Py_ERROR_OTHER;  /* any name not matched above */
+}
+
  PyObject *
  PyUnicode_DecodeASCII(const char *s,
                        Py_ssize_t size,
@@ -6657,8 +6679,9 @@ PyUnicode_DecodeASCII(const char *s,
      Py_ssize_t endinpos;
      Py_ssize_t outpos;
      const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
      PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
  
      if (size == 0)
          _Py_RETURN_UNICODE_EMPTY();
@@ -6687,12 +6710,45 @@ PyUnicode_DecodeASCII(const char *s,
              PyUnicode_WRITE(kind, data, writer.pos, c);
              writer.pos++;
              ++s;
+            continue;
          }
-        else {
+
+        /* byte outside range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we must switch to UCS2 at the first write */
+            if (kind < PyUnicode_2BYTE_KIND) {
+                if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+                                             0xffff) < 0)
+                    return NULL;
+                kind = writer.kind;
+                data = writer.data;
+            }
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
              startinpos = s-starts;
              endinpos = startinpos + 1;
              if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                      "ascii", "ordinal not in range(128)",
                      &starts, &e, &startinpos, &endinpos, &exc, &s,
                      &writer))
@@ -6701,13 +6757,13 @@ PyUnicode_DecodeASCII(const char *s,
              data = writer.data;
          }
      }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
      return _PyUnicodeWriter_Finish(&writer);
  
    onError:
      _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
      return NULL;
  }
author	Victor Stinner <victor.stinner@gmail.com>
	Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)
committer	Victor Stinner <victor.stinner@gmail.com>
	Mon, 21 Sep 2015 21:06:27 +0000 (23:06 +0200)
Doc/whatsnew/3.6.rst		patch \| blob \| history
Lib/test/test_codecs.py		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history