From c1b75b5fb92fda0ac5b931d7b18c1418557cb7c4 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Wed, 4 Jul 2018 01:32:41 -0700 Subject: [PATCH] bpo-34010: Fix tarfile read performance regression (GH-8020) During buffered read, use a list followed by join instead of extending a bytes object. This is how it was done before it was changed in commit b506dc32c1a. (cherry picked from commit 12a08c47601cadea8e7d3808502cdbcca87b2ce2) Co-authored-by: hajoscher --- Lib/tarfile.py | 20 ++++++++++--------- .../2018-07-04-07-36-53.bpo-34010.VNDkde.rst | 2 ++ 2 files changed, 13 insertions(+), 9 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 85119a48a4..edd31e96fb 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -532,7 +532,7 @@ class _Stream: if not buf: break t.append(buf) - buf = "".join(t) + buf = b"".join(t) else: buf = self._read(size) self.pos += len(buf) @@ -545,6 +545,7 @@ class _Stream: return self.__read(size) c = len(self.dbuf) + t = [self.dbuf] while c < size: buf = self.__read(self.bufsize) if not buf: @@ -553,26 +554,27 @@ class _Stream: buf = self.cmp.decompress(buf) except self.exception: raise ReadError("invalid compressed data") - self.dbuf += buf + t.append(buf) c += len(buf) - buf = self.dbuf[:size] - self.dbuf = self.dbuf[size:] - return buf + t = b"".join(t) + self.dbuf = t[size:] + return t[:size] def __read(self, size): """Return size bytes from stream. If internal buffer is empty, read another block from the stream. 
""" c = len(self.buf) + t = [self.buf] while c < size: buf = self.fileobj.read(self.bufsize) if not buf: break - self.buf += buf + t.append(buf) c += len(buf) - buf = self.buf[:size] - self.buf = self.buf[size:] - return buf + t = b"".join(t) + self.buf = t[size:] + return t[:size] # class _Stream class _StreamProxy(object): diff --git a/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst new file mode 100644 index 0000000000..4cb7892ee8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst @@ -0,0 +1,2 @@ +Fixed a performance regression for reading streams with tarfile. The +buffered read should use a list, instead of appending to a bytes object. -- 2.50.1