bpo-36311: Fixes decoding multibyte characters around chunk boundaries and improves...
author    Steve Dower <steve.dower@python.org>
          Wed, 21 Aug 2019 23:22:33 +0000 (16:22 -0700)
committer GitHub <noreply@github.com>
          Wed, 21 Aug 2019 23:22:33 +0000 (16:22 -0700)
Lib/test/test_codecs.py
Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst [new file with mode: 0644]
Objects/unicodeobject.c

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index b187ca650dc693e3b9d76264e0c3b8201d1c8d33..ba7f4847468a3eebd25c214402867665c20f12f7 100644 (file)
@@ -3075,13 +3075,13 @@ class CodePageTest(unittest.TestCase):
             self.assertEqual(codec.name, 'mbcs')
 
     @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
-    def test_large_input(self):
+    def test_large_input(self, size):
         # Test input longer than INT_MAX.
         # Input should contain undecodable bytes before and after
         # the INT_MAX limit.
-        encoded = (b'01234567' * (2**28-1) +
+        encoded = (b'01234567' * ((size//8)-1) +
                    b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
-        self.assertEqual(len(encoded), 2**31+2)
+        self.assertEqual(len(encoded), size+2)
         decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
         self.assertEqual(decoded[1], len(encoded))
         del encoded
@@ -3092,6 +3092,20 @@ class CodePageTest(unittest.TestCase):
                          '\udc85\udc86\udcea\udceb\udcec'
                          '\udcef\udcfc\udcfd\udcfe\udcff')
 
+    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
+    def test_large_utf8_input(self, size):
+        # Test input longer than INT_MAX.
+        # Input should contain a decodable multi-byte character
+        # spanning the INT_MAX boundary.
+        encoded = (b'0123456\xed\x84\x80' * (size//8))
+        self.assertEqual(len(encoded), size // 8 * 10)
+        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
+        self.assertEqual(decoded[1], len(encoded))
+        del encoded
+        self.assertEqual(len(decoded[0]), size)
+        self.assertEqual(decoded[0][:10], '0123456\ud10001')
+        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
+
 
 class ASCIITest(unittest.TestCase):
     def test_encode(self):
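The added test only runs under bigmemtest and needs several GiB of RAM. A small-scale sketch of the same scenario (hedged: Windows-only, since codecs.code_page_decode wraps MultiByteToWideChar; 65001 is the UTF-8 code page):

# Small-scale sketch of the byte pattern the new test repeats.
import codecs
import sys

if sys.platform == "win32":
    # b'\xed\x84\x80' is the UTF-8 encoding of U+D100; repeating the
    # 10-byte unit guarantees that, at full size, some character
    # straddles any fixed chunk boundary inside the decoder.
    unit = b"0123456\xed\x84\x80"
    decoded, consumed = codecs.code_page_decode(65001, unit * 4,
                                                "ignore", True)
    assert consumed == 4 * len(unit)
    assert decoded == "0123456\ud100" * 4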
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
new file mode 100644 (file)
index 0000000..c45f222
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
@@ -0,0 +1,2 @@
+Decoding bytes objects larger than 2GiB is faster and no longer fails when a
+multibyte character spans a chunk boundary.
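The failure mode behind this entry: the C decoder feeds MultiByteToWideChar fixed-size chunks, and a multibyte sequence cut at a chunk edge used to be treated as an error. A portable illustration of the required carry-over behaviour, using Python's incremental UTF-8 decoder rather than the Windows code-page path:

import codecs

data = "abc\ud100".encode("utf-8")        # ends with a 3-byte character
dec = codecs.getincrementaldecoder("utf-8")()
part1 = dec.decode(data[:4])              # 'abc' + 1 of 3 bytes -> 'abc'
part2 = dec.decode(data[4:], final=True)  # remaining 2 bytes -> '\ud100'
assert part1 + part2 == "abc\ud100"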
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5545eae79505a380426f8947f886d3a8f1e266bc..aa933773233b587c12a0609495365165c6bc94fb 100644 (file)
@@ -7186,6 +7186,12 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #define NEED_RETRY
 #endif
 
+/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
+   transcoding from UTF-16), but INT_MAX / 4 performs better in
+   both cases and also avoids partial characters overrunning the
+   length limit in MultiByteToWideChar on Windows. */
+#define DECODING_CHUNK_SIZE (INT_MAX/4)
+
 #ifndef WC_ERR_INVALID_CHARS
 #  define WC_ERR_INVALID_CHARS 0x0080
 #endif
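A quick sanity check of the two ceilings the new comment cites, assuming a 32-bit int (INT_MAX == 2**31 - 1, as on Windows):

INT_MAX = 2**31 - 1
DECODING_CHUNK_SIZE = INT_MAX // 4
# Decoding: at most INT_MAX bytes per MultiByteToWideChar call.
assert DECODING_CHUNK_SIZE < INT_MAX
# Transcoding from UTF-16 may double the size, so the effective limit
# is INT_MAX / 2; INT_MAX / 4 stays under it with a 2x margin.
assert 2 * DECODING_CHUNK_SIZE < INT_MAX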
@@ -7422,8 +7428,8 @@ decode_code_page_stateful(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        if (size > INT_MAX) {
-            chunk_size = INT_MAX;
+        if (size > DECODING_CHUNK_SIZE) {
+            chunk_size = DECODING_CHUNK_SIZE;
             final = 0;
             done = 0;
         }
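In rough Python terms the decode loop now behaves as below (a sketch, not the C implementation: names are illustrative, it is Windows-only, and it assumes every call consumes at least one byte, which holds because no code-page character comes anywhere near DECODING_CHUNK_SIZE bytes):

import codecs

DECODING_CHUNK_SIZE = (2**31 - 1) // 4   # mirrors (INT_MAX / 4)

def decode_code_page_chunked(code_page, data, errors="strict"):
    out = []
    pos = 0
    while pos < len(data):
        final = (len(data) - pos) <= DECODING_CHUNK_SIZE
        chunk = data[pos:pos + DECODING_CHUNK_SIZE]
        # With final=False, code_page_decode reports how many bytes it
        # consumed, so a partial character at the chunk edge is left
        # over and retried together with the next chunk.
        decoded, consumed = codecs.code_page_decode(code_page, chunk,
                                                    errors, final)
        out.append(decoded)
        pos += consumed
    return "".join(out)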
@@ -7827,10 +7833,8 @@ encode_code_page(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        /* UTF-16 encoding may double the size, so use only INT_MAX/2
-           chunks. */
-        if (len > INT_MAX/2) {
-            chunk_len = INT_MAX/2;
+        if (len > DECODING_CHUNK_SIZE) {
+            chunk_len = DECODING_CHUNK_SIZE;
             done = 0;
         }
         else
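The encode path drops its dedicated INT_MAX/2 comment because the shared constant is already below both ceilings. A rough Python model of the encode-side chunking (a sketch with illustrative names; Windows-only, since codecs.code_page_encode wraps WideCharToMultiByte):

import codecs

ENCODING_CHUNK = (2**31 - 1) // 4   # the shared DECODING_CHUNK_SIZE

def encode_code_page_chunked(code_page, text, errors="strict"):
    # Unlike decoding there is no cross-chunk state: a Python str is
    # indexed by code point, so every split is a character boundary.
    out = []
    for pos in range(0, len(text), ENCODING_CHUNK):
        encoded, consumed = codecs.code_page_encode(
            code_page, text[pos:pos + ENCODING_CHUNK], errors)
        out.append(encoded)
    return b"".join(out)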