bpo-34010: Fix tarfile read performance regression (GH-8020)

author hajoscher <hajoscher@gmail.com>

Wed, 4 Jul 2018 08:13:18 +0000 (10:13 +0200)

committer INADA Naoki <methane@users.noreply.github.com>

Wed, 4 Jul 2018 08:13:18 +0000 (17:13 +0900)
author hajoscher <hajoscher@gmail.com>
Wed, 4 Jul 2018 08:13:18 +0000 (10:13 +0200)
committer INADA Naoki <methane@users.noreply.github.com>
Wed, 4 Jul 2018 08:13:18 +0000 (17:13 +0900)
diff --git a/Lib/tarfile.py b/Lib/tarfile.py

index 7b4732d47197a8667a562e11399af31265529f2f..59f044cc5a00dcb798a686b8e8d8308147196134 100755 (executable)
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -525,7 +525,7 @@ class _Stream:
                  if not buf:
                      break
                  t.append(buf)
-            buf = "".join(t)
+            buf = b"".join(t)
          else:
              buf = self._read(size)
          self.pos += len(buf)
@@ -538,6 +538,7 @@ class _Stream:
              return self.__read(size)
  
          c = len(self.dbuf)
+        t = [self.dbuf]
          while c < size:
              buf = self.__read(self.bufsize)
              if not buf:
@@ -546,26 +547,27 @@ class _Stream:
                  buf = self.cmp.decompress(buf)
              except self.exception:
                  raise ReadError("invalid compressed data")
-            self.dbuf += buf
+            t.append(buf)
              c += len(buf)
-        buf = self.dbuf[:size]
-        self.dbuf = self.dbuf[size:]
-        return buf
+        t = b"".join(t)
+        self.dbuf = t[size:]
+        return t[:size]
  
      def __read(self, size):
          """Return size bytes from stream. If internal buffer is empty,
             read another block from the stream.
          """
          c = len(self.buf)
+        t = [self.buf]
          while c < size:
              buf = self.fileobj.read(self.bufsize)
              if not buf:
                  break
-            self.buf += buf
+            t.append(buf)
              c += len(buf)
-        buf = self.buf[:size]
-        self.buf = self.buf[size:]
-        return buf
+        t = b"".join(t)
+        self.buf = t[size:]
+        return t[:size]
  # class _Stream
  
  class _StreamProxy(object):
diff --git a/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst

new file mode 100644 (file)

index 0000000..4cb7892
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
@@ -0,0 +1,2 @@
+Fixed a performance regression for reading streams with tarfile. The
+buffered read should use a list, instead of appending to a bytes object.
author	hajoscher <hajoscher@gmail.com>
	Wed, 4 Jul 2018 08:13:18 +0000 (10:13 +0200)
committer	INADA Naoki <methane@users.noreply.github.com>
	Wed, 4 Jul 2018 08:13:18 +0000 (17:13 +0900)
Lib/tarfile.py		patch | blob | history
Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst	[new file with mode: 0644]	patch | blob