granicus.if.org Git - python/commitdiff
bpo-34010: Fix tarfile read performance regression (GH-8020)
author: hajoscher <hajoscher@gmail.com>
Wed, 4 Jul 2018 08:13:18 +0000 (10:13 +0200)
committer: INADA Naoki <methane@users.noreply.github.com>
Wed, 4 Jul 2018 08:13:18 +0000 (17:13 +0900)
During a buffered read, collect the chunks in a list and join them once, instead of repeatedly extending a bytes object.
This is how it was done before, but it was changed in commit b506dc32c1a.

Lib/tarfile.py
Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst [new file with mode: 0644]

index 7b4732d47197a8667a562e11399af31265529f2f..59f044cc5a00dcb798a686b8e8d8308147196134 100755 (executable)
@@ -525,7 +525,7 @@ class _Stream:
                 if not buf:
                     break
                 t.append(buf)
-            buf = "".join(t)
+            buf = b"".join(t)
         else:
             buf = self._read(size)
         self.pos += len(buf)
@@ -538,6 +538,7 @@ class _Stream:
             return self.__read(size)
 
         c = len(self.dbuf)
+        t = [self.dbuf]
         while c < size:
             buf = self.__read(self.bufsize)
             if not buf:
@@ -546,26 +547,27 @@ class _Stream:
                 buf = self.cmp.decompress(buf)
             except self.exception:
                 raise ReadError("invalid compressed data")
-            self.dbuf += buf
+            t.append(buf)
             c += len(buf)
-        buf = self.dbuf[:size]
-        self.dbuf = self.dbuf[size:]
-        return buf
+        t = b"".join(t)
+        self.dbuf = t[size:]
+        return t[:size]
 
     def __read(self, size):
         """Return size bytes from stream. If internal buffer is empty,
            read another block from the stream.
         """
         c = len(self.buf)
+        t = [self.buf]
         while c < size:
             buf = self.fileobj.read(self.bufsize)
             if not buf:
                 break
-            self.buf += buf
+            t.append(buf)
             c += len(buf)
-        buf = self.buf[:size]
-        self.buf = self.buf[size:]
-        return buf
+        t = b"".join(t)
+        self.buf = t[size:]
+        return t[:size]
 # class _Stream
 
 class _StreamProxy(object):
diff --git a/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
new file mode 100644 (file)
index 0000000..4cb7892
--- /dev/null
@@ -0,0 +1,2 @@
+Fixed a performance regression for reading streams with tarfile. The
+buffered read should use a list, instead of appending to a bytes object.