granicus.if.org Git - python/commitdiff
bpo-31993: Do not use memoryview when pickling large strings. (#5154)
authorSerhiy Storchaka <storchaka@gmail.com>
Fri, 12 Jan 2018 22:28:31 +0000 (00:28 +0200)
committerGitHub <noreply@github.com>
Fri, 12 Jan 2018 22:28:31 +0000 (00:28 +0200)
PyMemoryView_FromMemory() created a memoryview referring to
the internal data of the string.  When the string was destroyed,
the memoryview was left referring to freed memory.
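
A minimal Python-level sketch of the hazard (the DelayedWriter class is
hypothetical, modeled on the regression test below): a file-like object
may keep the chunks it receives and only read them after dump() returns,
so any buffer passed to write() must outlive the object being pickled.

    import pickle

    class DelayedWriter:
        """Hypothetical writer that keeps raw chunks and reads them
        only after dump() has returned, like a buffered stream."""
        def __init__(self):
            self.chunks = []
        def write(self, chunk):
            # Before this fix, a large str payload could arrive here as a
            # memoryview into the string's internal buffer; once the string
            # was freed, reading the chunk touched dangling memory.
            self.chunks.append(chunk)

    writer = DelayedWriter()
    data = '0123456789abcdef' * (64 * 1024)
    pickle.Pickler(writer, protocol=4).dump(data)
    del data  # drop the original string before touching the chunks

    payload = b"".join(bytes(c) for c in writer.chunks)
    assert pickle.loads(payload) == '0123456789abcdef' * (64 * 1024)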

Lib/test/pickletester.py
Misc/NEWS.d/3.7.0a4.rst
Modules/_pickle.c

index f4e3f81249c86ed27d0dc23c00f80a9afc9c849d..062ec5ae72bfac5be0e03fc3fa2ee70f9fe4afe4 100644 (file)
@@ -2169,67 +2169,67 @@ class AbstractPickleTests(unittest.TestCase):
     def test_framed_write_sizes_with_delayed_writer(self):
         class ChunkAccumulator:
             """Accumulate pickler output in a list of raw chunks."""
-
             def __init__(self):
                 self.chunks = []
-
             def write(self, chunk):
                 self.chunks.append(chunk)
-
             def concatenate_chunks(self):
-                # Some chunks can be memoryview instances, we need to convert
-                # them to bytes to be able to call join
-                return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c
-                                 for c in self.chunks])
-
-        small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
-                         for i in range(int(1e4))]
+                return b"".join(self.chunks)
 
         for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
+            objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
+                       for i in range(int(1e4))]
+            # Add a large unique ASCII string
+            objects.append('0123456789abcdef' *
+                           (self.FRAME_SIZE_TARGET // 16 + 1))
+
             # Protocol 4 packs groups of small objects into frames and issues
             # calls to write only once or twice per frame:
             # The C pickler issues one call to write per-frame (header and
             # contents) while Python pickler issues two calls to write: one for
             # the frame header and one for the frame binary contents.
             writer = ChunkAccumulator()
-            self.pickler(writer, proto).dump(small_objects)
+            self.pickler(writer, proto).dump(objects)
 
             # Actually read the binary content of the chunks after the end
-            # of the call to dump: ant memoryview passed to write should not
+            # of the call to dump: any memoryview passed to write should not
             # be released otherwise this delayed access would not be possible.
             pickled = writer.concatenate_chunks()
             reconstructed = self.loads(pickled)
-            self.assertEqual(reconstructed, small_objects)
+            self.assertEqual(reconstructed, objects)
             self.assertGreater(len(writer.chunks), 1)
 
-            n_frames, remainder = divmod(len(pickled), self.FRAME_SIZE_TARGET)
-            if remainder > 0:
-                n_frames += 1
+            # memoryviews should own the memory.
+            del objects
+            support.gc_collect()
+            self.assertEqual(writer.concatenate_chunks(), pickled)
 
+            n_frames = (len(pickled) - 1) // self.FRAME_SIZE_TARGET + 1
             # There should be at least one call to write per frame
             self.assertGreaterEqual(len(writer.chunks), n_frames)
 
             # but not too many either: there can be one for the proto,
-            # one per-frame header and one per frame for the actual contents.
-            self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks))
+            # one per-frame header, one per frame for the actual contents,
+            # and two for the header.
+            self.assertLessEqual(len(writer.chunks), 2 * n_frames + 3)
 
-            chunk_sizes = [len(c) for c in writer.chunks[:-1]]
+            chunk_sizes = [len(c) for c in writer.chunks]
             large_sizes = [s for s in chunk_sizes
                            if s >= self.FRAME_SIZE_TARGET]
-            small_sizes = [s for s in chunk_sizes
-                           if s < self.FRAME_SIZE_TARGET]
+            medium_sizes = [s for s in chunk_sizes
+                            if 9 < s < self.FRAME_SIZE_TARGET]
+            small_sizes = [s for s in chunk_sizes if s <= 9]
 
             # Large chunks should not be too large:
             for chunk_size in large_sizes:
-                self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size)
-
-            last_chunk_size = len(writer.chunks[-1])
-            self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size)
-
-            # Small chunks (if any) should be very small
-            # (only proto and frame headers)
-            for chunk_size in small_sizes:
-                self.assertGreaterEqual(9, chunk_size)
+                self.assertLess(chunk_size, 2 * self.FRAME_SIZE_TARGET,
+                                chunk_sizes)
+            # There shouldn't be too many small chunks: the protocol header,
+            # the frame headers and the large string headers are written
+            # in small chunks.
+            self.assertLessEqual(len(small_sizes),
+                                 len(large_sizes) + len(medium_sizes) + 3,
+                                 chunk_sizes)
 
     def test_nested_names(self):
         global Nested
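
As an aside, the rewritten frame count in the hunk above is plain ceiling
division; a standalone sketch of the equivalence (expected_frames is a
hypothetical helper; 64 KiB mirrors pickle's frame size target):

    FRAME_SIZE_TARGET = 64 * 1024  # pickle's framing target

    def expected_frames(pickled_len):
        # Ceiling division, equivalent to the old divmod form:
        # n, r = divmod(pickled_len, FRAME_SIZE_TARGET); n += (r > 0)
        return (pickled_len - 1) // FRAME_SIZE_TARGET + 1

    assert expected_frames(1) == 1
    assert expected_frames(FRAME_SIZE_TARGET) == 1
    assert expected_frames(FRAME_SIZE_TARGET + 1) == 2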
index ca9aa171bfa81e20e1e500876fd0311c9befab2c..a6322ed405bac381b9c37edfe558a445c5d18724 100644 (file)
@@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt
 .. nonce: -OMNg8
 .. section: Library
 
-The picklers no longer allocate temporary memory when dumping large
-``bytes`` and ``str`` objects into a file object. Instead the data is
-directly streamed into the underlying file object.
+The pickler now uses less memory when serializing large bytes and str
+objects into a file.  Pickles created with protocol 4 will require less
+memory for unpickling large bytes and str objects.
 
 ..
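A short sketch of the behavior this entry describes (sizes are arbitrary):
with protocol 4, a large bytes payload is streamed into the file rather
than copied through a temporary buffer first.

    import pickle, tempfile

    big = b'x' * (16 * 1024 * 1024)  # 16 MiB payload, large enough to stream
    with tempfile.TemporaryFile() as f:
        # The C pickler writes big's data straight to f rather than
        # staging a second in-memory copy first.
        pickle.dump(big, f, protocol=4)
        f.seek(0)
        assert pickle.load(f) == big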
 
index 5b3de4d96dda02d31e38de7a907ae0463a73c23f..f8cbea12e809584a9d7655371ad3c688e51f6f3f 100644 (file)
@@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self,
         /* Stream write the payload into the file without going through the
            output buffer. */
         if (payload == NULL) {
-            payload = mem = PyMemoryView_FromMemory((char *) data, data_size,
-                                                    PyBUF_READ);
+            /* TODO: It would be better to use a memoryview with a linked
+               original string if this is possible. */
+            payload = mem = PyBytes_FromStringAndSize(data, data_size);
             if (payload == NULL) {
                 return -1;
             }
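
The TODO hints at the preferred alternative: a memoryview obtained through
the buffer protocol keeps a reference to its owner, whereas
PyMemoryView_FromMemory() wraps a bare pointer with no owner.  The
ownership difference is visible from Python (a sketch, not the C fix
itself):

    data = bytes(range(256))
    view = memoryview(data)   # acquired via the buffer protocol
    assert view.obj is data   # the view holds a reference to its owner
    del data                  # the buffer stays alive through the view
    assert view[0] == 0       # still valid; no dangling memory

    # PyMemoryView_FromMemory(), by contrast, records only the raw
    # pointer, which is why the old code dangled once the string died.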