granicus.if.org Git - python/commitdiff
Issue #19858: pickletools.optimize() is now aware of the MEMOIZE opcode, can
author: Serhiy Storchaka <storchaka@gmail.com>
Tue, 16 Dec 2014 16:00:56 +0000 (18:00 +0200)
committer: Serhiy Storchaka <storchaka@gmail.com>
Tue, 16 Dec 2014 16:00:56 +0000 (18:00 +0200)
produce a more compact result, and no longer produces invalid output if input
data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.

Lib/pickletools.py
Lib/test/test_pickletools.py
Misc/NEWS

index 71c2aa1c79e8e157796ddc50537fa81fe1214482..6b86723a4c0d2dfbe06ef285d5a30ec00e21d625 100644 (file)
@@ -2282,40 +2282,61 @@ def genops(pickle):
 
 def optimize(p):
     'Optimize a pickle string by removing unused PUT opcodes'
-    not_a_put = object()
-    gets = { not_a_put }    # set of args used by a GET opcode
-    opcodes = []            # (startpos, stoppos, putid)
+    put = 'PUT'
+    get = 'GET'
+    oldids = set()          # set of all PUT ids
+    newids = {}             # set of ids used by a GET opcode
+    opcodes = []            # (op, idx) or (pos, end_pos)
     proto = 0
+    protoheader = b''
     for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
         if 'PUT' in opcode.name:
-            opcodes.append((pos, end_pos, arg))
+            oldids.add(arg)
+            opcodes.append((put, arg))
+        elif opcode.name == 'MEMOIZE':
+            idx = len(oldids)
+            oldids.add(idx)
+            opcodes.append((put, idx))
         elif 'FRAME' in opcode.name:
             pass
-        else:
-            if 'GET' in opcode.name:
-                gets.add(arg)
-            elif opcode.name == 'PROTO':
-                assert pos == 0, pos
+        elif 'GET' in opcode.name:
+            if opcode.proto > proto:
+                proto = opcode.proto
+            newids[arg] = None
+            opcodes.append((get, arg))
+        elif opcode.name == 'PROTO':
+            if arg > proto:
                 proto = arg
-            opcodes.append((pos, end_pos, not_a_put))
-            prevpos, prevarg = pos, None
+            if pos == 0:
+                protoheader = p[pos: end_pos]
+            else:
+                opcodes.append((pos, end_pos))
+        else:
+            opcodes.append((pos, end_pos))
+    del oldids
 
     # Copy the opcodes except for PUTS without a corresponding GET
     out = io.BytesIO()
-    opcodes = iter(opcodes)
-    if proto >= 2:
-        # Write the PROTO header before any framing
-        start, stop, _ = next(opcodes)
-        out.write(p[start:stop])
-    buf = pickle._Framer(out.write)
+    # Write the PROTO header before any framing
+    out.write(protoheader)
+    pickler = pickle._Pickler(out, proto)
     if proto >= 4:
-        buf.start_framing()
-    for start, stop, putid in opcodes:
-        if putid in gets:
-            buf.commit_frame()
-            buf.write(p[start:stop])
-    if proto >= 4:
-        buf.end_framing()
+        pickler.framer.start_framing()
+    idx = 0
+    for op, arg in opcodes:
+        if op is put:
+            if arg not in newids:
+                continue
+            data = pickler.put(idx)
+            newids[arg] = idx
+            idx += 1
+        elif op is get:
+            data = pickler.get(newids[arg])
+        else:
+            data = p[op:arg]
+        pickler.framer.commit_frame()
+        pickler.write(data)
+    pickler.framer.end_framing()
     return out.getvalue()
 
 ##############################################################################
index d37ac263c4c14f3f3bc645ab78446d93a75a3001..bbe6875545a7c2b4940e93add1c1929caf6937c1 100644 (file)
@@ -1,3 +1,4 @@
+import struct
 import pickle
 import pickletools
 from test import support
@@ -15,6 +16,48 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests):
     # Test relies on precise output of dumps()
     test_pickle_to_2x = None
 
+    def test_optimize_long_binget(self):
+        data = [str(i) for i in range(257)]
+        data.append(data[-1])
+        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+            pickled = pickle.dumps(data, proto)
+            unpickled = pickle.loads(pickled)
+            self.assertEqual(unpickled, data)
+            self.assertIs(unpickled[-1], unpickled[-2])
+
+            pickled2 = pickletools.optimize(pickled)
+            unpickled2 = pickle.loads(pickled2)
+            self.assertEqual(unpickled2, data)
+            self.assertIs(unpickled2[-1], unpickled2[-2])
+            self.assertNotIn(pickle.LONG_BINGET, pickled2)
+            self.assertNotIn(pickle.LONG_BINPUT, pickled2)
+
+    def test_optimize_binput_and_memoize(self):
+        pickled = (b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00'
+                   b']\x94(\x8c\x04spamq\x01\x8c\x03ham\x94h\x02e.')
+        #    0: \x80 PROTO      4
+        #    2: \x95 FRAME      21
+        #   11: ]    EMPTY_LIST
+        #   12: \x94 MEMOIZE
+        #   13: (    MARK
+        #   14: \x8c     SHORT_BINUNICODE 'spam'
+        #   20: q        BINPUT     1
+        #   22: \x8c     SHORT_BINUNICODE 'ham'
+        #   27: \x94     MEMOIZE
+        #   28: h        BINGET     2
+        #   30: e        APPENDS    (MARK at 13)
+        #   31: .    STOP
+        self.assertIn(pickle.BINPUT, pickled)
+        unpickled = pickle.loads(pickled)
+        self.assertEqual(unpickled, ['spam', 'ham', 'ham'])
+        self.assertIs(unpickled[1], unpickled[2])
+
+        pickled2 = pickletools.optimize(pickled)
+        unpickled2 = pickle.loads(pickled2)
+        self.assertEqual(unpickled2, ['spam', 'ham', 'ham'])
+        self.assertIs(unpickled2[1], unpickled2[2])
+        self.assertNotIn(pickle.BINPUT, pickled2)
+
 
 def test_main():
     support.run_unittest(OptimizedPickleTests)
index 93ae79cf7e2bf30a29883f6c7b965c9007eddddb..ae37fd1fb1575f3712127a6ffa332f5807629057 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -41,6 +41,10 @@ Core and Builtins
 Library
 -------
 
+- Issue #19858:  pickletools.optimize() now aware of the MEMOIZE opcode, can
+  produce more compact result and no longer produces invalid output if input
+  data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.
+
 - Issue #22095: Fixed HTTPConnection.set_tunnel with default port.  The port
   value in the host header was set to "None".  Patch by Demian Brecht.