bpo-36311: Fixes decoding multibyte characters around chunk boundaries and improves...
author    Steve Dower <steve.dower@python.org>
          Wed, 21 Aug 2019 23:22:33 +0000 (16:22 -0700)
committer GitHub <noreply@github.com>
          Wed, 21 Aug 2019 23:22:33 +0000 (16:22 -0700)
Lib/test/test_codecs.py
Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst [new file with mode: 0644]
Objects/unicodeobject.c

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index b187ca650dc693e3b9d76264e0c3b8201d1c8d33..ba7f4847468a3eebd25c214402867665c20f12f7 100644 (file)
@@ -3075,13 +3075,13 @@ class CodePageTest(unittest.TestCase):
             self.assertEqual(codec.name, 'mbcs')
 
     @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
-    def test_large_input(self):
+    def test_large_input(self, size):
         # Test input longer than INT_MAX.
         # Input should contain undecodable bytes before and after
         # the INT_MAX limit.
-        encoded = (b'01234567' * (2**28-1) +
+        encoded = (b'01234567' * ((size//8)-1) +
                    b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
-        self.assertEqual(len(encoded), 2**31+2)
+        self.assertEqual(len(encoded), size+2)
         decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
         self.assertEqual(decoded[1], len(encoded))
         del encoded
@@ -3092,6 +3092,20 @@ class CodePageTest(unittest.TestCase):
                          '\udc85\udc86\udcea\udceb\udcec'
                          '\udcef\udcfc\udcfd\udcfe\udcff')
 
+    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
+    def test_large_utf8_input(self, size):
+        # Test input longer than INT_MAX.
+        # Input should contain a decodable multi-byte character
+        # spanning the INT_MAX boundary.
+        encoded = (b'0123456\xed\x84\x80' * (size//8))
+        self.assertEqual(len(encoded), size // 8 * 10)
+        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
+        self.assertEqual(decoded[1], len(encoded))
+        del encoded
+        self.assertEqual(len(decoded[0]), size)
+        self.assertEqual(decoded[0][:10], '0123456\ud10001')
+        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
+
 
 class ASCIITest(unittest.TestCase):
     def test_encode(self):
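The added test only runs under bigmemtest and needs several GiB of RAM. A small-scale sketch of the same scenario (hedged: Windows-only, since codecs.code_page_decode wraps MultiByteToWideChar; 65001 is the UTF-8 code page):

# Small-scale sketch of the byte pattern the new test repeats.
import codecs
import sys

if sys.platform == "win32":
    # b'\xed\x84\x80' is the UTF-8 encoding of U+D100; repeating the
    # 10-byte unit guarantees that, at full size, some character
    # straddles any fixed chunk boundary inside the decoder.
    unit = b"0123456\xed\x84\x80"
    decoded, consumed = codecs.code_page_decode(65001, unit * 4,
                                                "ignore", True)
    assert consumed == 4 * len(unit)
    assert decoded == "0123456\ud100" * 4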
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
new file mode 100644 (file)
index 0000000..c45f222
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
@@ -0,0 +1,2 @@
+Decoding bytes objects larger than 2GiB is faster and no longer fails when a
+multibyte character spans a chunk boundary.
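The failure mode behind this entry: the C decoder feeds MultiByteToWideChar fixed-size chunks, and a multibyte sequence cut at a chunk edge used to be treated as an error. A portable illustration of the required carry-over behaviour, using Python's incremental UTF-8 decoder rather than the Windows code-page path:

import codecs

data = "abc\ud100".encode("utf-8")        # ends with a 3-byte character
dec = codecs.getincrementaldecoder("utf-8")()
part1 = dec.decode(data[:4])              # 'abc' + 1 of 3 bytes -> 'abc'
part2 = dec.decode(data[4:], final=True)  # remaining 2 bytes -> '\ud100'
assert part1 + part2 == "abc\ud100"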
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5545eae79505a380426f8947f886d3a8f1e266bc..aa933773233b587c12a0609495365165c6bc94fb 100644 (file)
@@ -7186,6 +7186,12 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #define NEED_RETRY
 #endif
 
+/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
+   transcoding from UTF-16), but INT_MAX / 4 performs better in
+   both cases and also avoids partial characters overrunning the
+   length limit in MultiByteToWideChar on Windows. */
+#define DECODING_CHUNK_SIZE (INT_MAX/4)
+
 #ifndef WC_ERR_INVALID_CHARS
 #  define WC_ERR_INVALID_CHARS 0x0080
 #endif
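A quick sanity check of the two ceilings the new comment cites, assuming a 32-bit int (INT_MAX == 2**31 - 1, as on Windows):

INT_MAX = 2**31 - 1
DECODING_CHUNK_SIZE = INT_MAX // 4
# Decoding: at most INT_MAX bytes per MultiByteToWideChar call.
assert DECODING_CHUNK_SIZE < INT_MAX
# Transcoding from UTF-16 may double the size, so the effective limit
# is INT_MAX / 2; INT_MAX / 4 stays under it with a 2x margin.
assert 2 * DECODING_CHUNK_SIZE < INT_MAX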
@@ -7422,8 +7428,8 @@ decode_code_page_stateful(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        if (size > INT_MAX) {
-            chunk_size = INT_MAX;
+        if (size > DECODING_CHUNK_SIZE) {
+            chunk_size = DECODING_CHUNK_SIZE;
             final = 0;
             done = 0;
         }
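In rough Python terms the decode loop now behaves as below (a sketch, not the C implementation: names are illustrative, it is Windows-only, and it assumes every call consumes at least one byte, which holds because no code-page character comes anywhere near DECODING_CHUNK_SIZE bytes):

import codecs

DECODING_CHUNK_SIZE = (2**31 - 1) // 4   # mirrors (INT_MAX / 4)

def decode_code_page_chunked(code_page, data, errors="strict"):
    out = []
    pos = 0
    while pos < len(data):
        final = (len(data) - pos) <= DECODING_CHUNK_SIZE
        chunk = data[pos:pos + DECODING_CHUNK_SIZE]
        # With final=False, code_page_decode reports how many bytes it
        # consumed, so a partial character at the chunk edge is left
        # over and retried together with the next chunk.
        decoded, consumed = codecs.code_page_decode(code_page, chunk,
                                                    errors, final)
        out.append(decoded)
        pos += consumed
    return "".join(out)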
@@ -7827,10 +7833,8 @@ encode_code_page(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        /* UTF-16 encoding may double the size, so use only INT_MAX/2
-           chunks. */
-        if (len > INT_MAX/2) {
-            chunk_len = INT_MAX/2;
+        if (len > DECODING_CHUNK_SIZE) {
+            chunk_len = DECODING_CHUNK_SIZE;
             done = 0;
         }
         else
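The encode path drops its dedicated INT_MAX/2 comment because the shared constant is already below both ceilings. A rough Python model of the encode-side chunking (a sketch with illustrative names; Windows-only, since codecs.code_page_encode wraps WideCharToMultiByte):

import codecs

ENCODING_CHUNK = (2**31 - 1) // 4   # the shared DECODING_CHUNK_SIZE

def encode_code_page_chunked(code_page, text, errors="strict"):
    # Unlike decoding there is no cross-chunk state: a Python str is
    # indexed by code point, so every split is a character boundary.
    out = []
    for pos in range(0, len(text), ENCODING_CHUNK):
        encoded, consumed = codecs.code_page_encode(
            code_page, text[pos:pos + ENCODING_CHUNK], errors)
        out.append(encoded)
    return b"".join(out)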