From 9f6cbe09cc88be914600306b34ac3d0025738465 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Lars=20Gust=C3=A4bel?= Date: Wed, 23 Feb 2011 11:52:31 +0000 Subject: [PATCH] Merged revisions 88528 via svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/py3k ........ r88528 | lars.gustaebel | 2011-02-23 12:42:22 +0100 (Wed, 23 Feb 2011) | 16 lines Issue #11224: Improved sparse file read support (r85916) introduced a regression in _FileInFile which is used in file-like objects returned by TarFile.extractfile(). The inefficient design of the _FileInFile.read() method causes various dramatic side-effects and errors: - The data segment of a file member is read completely into memory every(!) time a small block is accessed. This is not only slow but may cause unexpected MemoryErrors with very large files. - Reading members from compressed tar archives is even slower because of the excessive backwards seeking which is done when the same data segment is read over and over again. - As a backwards seek on a TarFile opened in stream mode is not possible, using extractfile() fails with a StreamError. ........ --- Lib/tarfile.py | 5 ++--- Lib/test/test_tarfile.py | 16 ++++++++++++++++ Misc/NEWS | 4 ++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index e3747e9c79..0f9d1dade1 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -760,9 +760,8 @@ class _FileInFile(object): self.map_index = 0 length = min(size, stop - self.position) if data: - self.fileobj.seek(offset) - block = self.fileobj.read(stop - start) - buf += block[self.position - start:self.position + length] + self.fileobj.seek(offset + (self.position - start)) + buf += self.fileobj.read(length) else: buf += NUL * length size -= length diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 94ef61c0ce..68e094d5db 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -419,6 +419,22 @@ class StreamReadTest(CommonReadTest): mode="r|" + def test_read_through(self): + # Issue #11224: A poorly designed _FileInFile.read() method + # caused seeking errors with stream tar files. + for tarinfo in self.tar: + if not tarinfo.isreg(): + continue + fobj = self.tar.extractfile(tarinfo) + while True: + try: + buf = fobj.read(512) + except tarfile.StreamError: + self.fail("simple read-through using TarFile.extractfile() failed") + if not buf: + break + fobj.close() + def test_fileobj_regular_file(self): tarinfo = self.tar.next() # get "regtype" (can't use getmember) fobj = self.tar.extractfile(tarinfo) diff --git a/Misc/NEWS b/Misc/NEWS index a5ac7c104e..7488686a97 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -15,6 +15,10 @@ Core and Builtins Library ------- +- Issue #11224: Fixed a regression in tarfile that affected the file-like + objects returned by TarFile.extractfile() regarding performance, memory + consumption and failures with the stream interface. + - Issue #11074: Make 'tokenize' so it can be reloaded. - Issue #4681: Allow mmap() to work on file sizes and offsets larger than -- 2.40.0