Merged revisions 77288 via svnmerge from

author Antoine Pitrou <solipsis@pitrou.net>

Sun, 3 Jan 2010 22:37:40 +0000 (22:37 +0000)

committer Antoine Pitrou <solipsis@pitrou.net>

Sun, 3 Jan 2010 22:37:40 +0000 (22:37 +0000)
author Antoine Pitrou <solipsis@pitrou.net>
Sun, 3 Jan 2010 22:37:40 +0000 (22:37 +0000)
committer Antoine Pitrou <solipsis@pitrou.net>
Sun, 3 Jan 2010 22:37:40 +0000 (22:37 +0000)
diff --git a/Lib/gzip.py b/Lib/gzip.py

index f9a59d7ff0aef800021803ef472420324b319f9e..66fc88daa0d442c1253e1479d067171c6bcd0140 100644 (file)
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -8,6 +8,7 @@ but random access is not allowed."""
  import struct, sys, time, os
  import zlib
  import builtins
+import io
  
  __all__ = ["GzipFile","open"]
  
@@ -44,7 +45,7 @@ def open(filename, mode="rb", compresslevel=9):
      """
      return GzipFile(filename, mode, compresslevel)
  
-class GzipFile:
+class GzipFile(io.BufferedIOBase):
      """The GzipFile class simulates most of the methods of a file object with
      the exception of the readinto() and truncate() methods.
  
@@ -109,8 +110,12 @@ class GzipFile:
              self.mode = READ
              # Set flag indicating start of a new member
              self._new_member = True
+            # Buffer data read from gzip file. extrastart is offset in
+            # stream where buffer starts. extrasize is number of
+            # bytes remaining in buffer from current stream position.
              self.extrabuf = b""
              self.extrasize = 0
+            self.extrastart = 0
              self.name = filename
              # Starts small, scales exponentially
              self.min_readsize = 100
@@ -214,7 +219,6 @@ class GzipFile:
          if flag & FHCRC:
              self.fileobj.read(2)     # Read & discard the 16-bit header CRC
  
-
      def write(self,data):
          if self.mode != WRITE:
              import errno
@@ -222,12 +226,19 @@ class GzipFile:
  
          if self.fileobj is None:
              raise ValueError("write() on closed GzipFile object")
+
+        # Convert data type if called by io.BufferedWriter.
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+
          if len(data) > 0:
              self.size = self.size + len(data)
              self.crc = zlib.crc32(data, self.crc) & 0xffffffff
              self.fileobj.write( self.compress.compress(data) )
              self.offset += len(data)
  
+        return len(data)
+
      def read(self, size=-1):
          if self.mode != READ:
              import errno
@@ -253,15 +264,14 @@ class GzipFile:
                  if size > self.extrasize:
                      size = self.extrasize
  
-        chunk = self.extrabuf[:size]
-        self.extrabuf = self.extrabuf[size:]
+        offset = self.offset - self.extrastart
+        chunk = self.extrabuf[offset: offset + size]
          self.extrasize = self.extrasize - size
  
          self.offset += size
          return chunk
  
      def _unread(self, buf):
-        self.extrabuf = buf + self.extrabuf
          self.extrasize = len(buf) + self.extrasize
          self.offset -= len(buf)
  
@@ -317,8 +327,10 @@ class GzipFile:
  
      def _add_read_data(self, data):
          self.crc = zlib.crc32(data, self.crc) & 0xffffffff
-        self.extrabuf = self.extrabuf + data
+        offset = self.offset - self.extrastart
+        self.extrabuf = self.extrabuf[offset:] + data
          self.extrasize = self.extrasize + len(data)
+        self.extrastart = self.offset
          self.size = self.size + len(data)
  
      def _read_eof(self):
@@ -336,6 +348,10 @@ class GzipFile:
          elif isize != (self.size & 0xffffffff):
              raise IOError("Incorrect length of data produced")
  
+    @property
+    def closed(self):
+        return self.fileobj is None
+
      def close(self):
          if self.fileobj is None:
              return
@@ -351,15 +367,6 @@ class GzipFile:
              self.myfileobj.close()
              self.myfileobj = None
  
-    def __del__(self):
-        try:
-            if (self.myfileobj is None and
-                self.fileobj is None):
-                return
-        except AttributeError:
-            return
-        self.close()
-
      def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
          if self.mode == WRITE:
              # Ensure the compressor's buffer is flushed
@@ -374,12 +381,6 @@ class GzipFile:
          """
          return self.fileobj.fileno()
  
-    def isatty(self):
-        return False
-
-    def tell(self):
-        return self.offset
-
      def rewind(self):
          '''Return the uncompressed stream file position indicator to the
          beginning of the file'''
@@ -389,8 +390,18 @@ class GzipFile:
          self._new_member = True
          self.extrabuf = b""
          self.extrasize = 0
+        self.extrastart = 0
          self.offset = 0
  
+    def readable(self):
+        return self.mode == READ
+
+    def writable(self):
+        return self.mode == WRITE
+
+    def seekable(self):
+        return True
+
      def seek(self, offset, whence=0):
          if whence:
              if whence == 1:
@@ -414,8 +425,18 @@ class GzipFile:
                  self.read(1024)
              self.read(count % 1024)
  
+        return self.offset
+
      def readline(self, size=-1):
          if size < 0:
+            # Shortcut common case - newline found in buffer.
+            offset = self.offset - self.extrastart
+            i = self.extrabuf.find(b'\n', offset) + 1
+            if i > 0:
+                self.extrasize -= i - offset
+                self.offset += i - offset
+                return self.extrabuf[offset: i]
+
              size = sys.maxsize
              readsize = self.min_readsize
          else:
@@ -445,42 +466,6 @@ class GzipFile:
              self.min_readsize = min(readsize, self.min_readsize * 2, 512)
          return b''.join(bufs) # Return resulting line
  
-    def readlines(self, sizehint=0):
-        # Negative numbers result in reading all the lines
-        if sizehint <= 0:
-            sizehint = sys.maxsize
-        L = []
-        while sizehint > 0:
-            line = self.readline()
-            if line == b"":
-                break
-            L.append(line)
-            sizehint = sizehint - len(line)
-
-        return L
-
-    def writelines(self, L):
-        for line in L:
-            self.write(line)
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        line = self.readline()
-        if line:
-            return line
-        else:
-            raise StopIteration
-
-    def __enter__(self):
-        if self.fileobj is None:
-            raise ValueError("I/O operation on closed GzipFile object")
-        return self
-
-    def __exit__(self, *args):
-        self.close()
-
  
  def _test():
      # Act like gzip; with -d, act like gunzip.
diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py

index fa91dc0262394e721afe5b3cd2c0cf3bf8d5e23b..320adfda8a0c2f7c8461b15f8b0f4076500889a2 100644 (file)
--- a/Lib/test/test_gzip.py
+++ b/Lib/test/test_gzip.py
@@ -5,6 +5,7 @@
  import unittest
  from test import support
  import os
+import io
  import struct
  gzip = support.import_module('gzip')
  
@@ -80,6 +81,16 @@ class TestGzip(unittest.TestCase):
          zgfile.close()
          self.assertEquals(contents, b'a'*201)
  
+    def test_buffered_reader(self):
+        # Issue #7471: a GzipFile can be wrapped in a BufferedReader for
+        # performance.
+        self.test_write()
+
+        f = gzip.GzipFile(self.filename, 'rb')
+        with io.BufferedReader(f) as r:
+            lines = [line for line in r]
+
+        self.assertEqual(lines, 50 * data1.splitlines(True))
  
      def test_readline(self):
          self.test_write()
diff --git a/Misc/NEWS b/Misc/NEWS

index 451a2a06d4afe0abf078874edfe8f1d68fb943bd..815e3922b9c3b06f05837f9cf1590a37c233763a 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -191,7 +191,11 @@ C-API
  Library
  -------
  
-_ Issue #3972: http.client.HTTPConnection now accepts an optional source_address
+- Issue #7471: Improve the performance of GzipFile's buffering mechanism,
+  and make it implement the `io.BufferedIOBase` ABC to allow for further
+  speedups by wrapping it in an `io.BufferedReader`.  Patch by Nir Aides.
+
+- Issue #3972: http.client.HTTPConnection now accepts an optional source_address
    parameter to allow specifying where your connections come from.
  
  - socket.create_connection now accepts an optional source_address parameter.
author	Antoine Pitrou <solipsis@pitrou.net>
	Sun, 3 Jan 2010 22:37:40 +0000 (22:37 +0000)
committer	Antoine Pitrou <solipsis@pitrou.net>
	Sun, 3 Jan 2010 22:37:40 +0000 (22:37 +0000)
Lib/gzip.py		patch | blob | history
Lib/test/test_gzip.py		patch | blob | history
Misc/NEWS		patch | blob | history