]> granicus.if.org Git - python/commitdiff
Issue #15068: Got rid of excessive buffering in the fileinput module.
authorSerhiy Storchaka <storchaka@gmail.com>
Tue, 8 Mar 2016 16:28:36 +0000 (18:28 +0200)
committerSerhiy Storchaka <storchaka@gmail.com>
Tue, 8 Mar 2016 16:28:36 +0000 (18:28 +0200)
The bufsize parameter is no longer used.

Doc/library/fileinput.rst
Lib/fileinput.py
Lib/test/test_fileinput.py
Misc/NEWS

index ee06830ad8e7640d22ff9fd8b1c7866d54cbf4e1..9510f763325b9e4a10cb58e4612b5810af7cb3cb 100644 (file)
@@ -71,6 +71,9 @@ The following function is the primary interface of this module:
    .. versionchanged:: 3.2
       Can be used as a context manager.
 
+   .. versionchanged:: 3.5.2
+      The *bufsize* parameter is no longer used.
+
 
 The following functions use the global state created by :func:`fileinput.input`;
 if there is no active state, :exc:`RuntimeError` is raised.
@@ -161,7 +164,10 @@ available for subclassing as well:
       Can be used as a context manager.
 
    .. deprecated:: 3.4
-        The ``'rU'`` and ``'U'`` modes.
+      The ``'rU'`` and ``'U'`` modes.
+
+   .. versionchanged:: 3.5.2
+      The *bufsize* parameter is no longer used.
 
 
 **Optional in-place filtering:** if the keyword argument ``inplace=True`` is
index 021e39f83a370f5ae93eb9aaf4a30a7a685f92e3..9f14762db032cb14422a98bc4f8a7717faf2f557 100644 (file)
@@ -64,13 +64,6 @@ deleted when the output file is closed.  In-place filtering is
 disabled when standard input is read.  XXX The current implementation
 does not work for MS-DOS 8+3 filesystems.
 
-Performance: this module is unfortunately one of the slower ways of
-processing large numbers of input lines.  Nevertheless, a significant
-speed-up has been obtained by using readlines(bufsize) instead of
-readline().  A new keyword argument, bufsize=N, is present on the
-input() function and the FileInput() class to override the default
-buffer size.
-
 XXX Possible additions:
 
 - optional getopt argument processing
@@ -86,6 +79,7 @@ __all__ = ["input", "close", "nextfile", "filename", "lineno", "filelineno",
 
 _state = None
 
+# No longer used
 DEFAULT_BUFSIZE = 8*1024
 
 def input(files=None, inplace=False, backup="", bufsize=0,
@@ -207,17 +201,15 @@ class FileInput:
         self._files = files
         self._inplace = inplace
         self._backup = backup
-        self._bufsize = bufsize or DEFAULT_BUFSIZE
         self._savestdout = None
         self._output = None
         self._filename = None
-        self._lineno = 0
+        self._startlineno = 0
         self._filelineno = 0
         self._file = None
+        self._readline = self._start_readline
         self._isstdin = False
         self._backupfilename = None
-        self._buffer = []
-        self._bufindex = 0
         # restrict mode argument to reading modes
         if mode not in ('r', 'rU', 'U', 'rb'):
             raise ValueError("FileInput opening mode must be one of "
@@ -253,22 +245,18 @@ class FileInput:
         return self
 
     def __next__(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
-        else:
-            self._bufindex += 1
-            self._lineno += 1
+        line = self._readline()
+        if line:
             self._filelineno += 1
             return line
-        line = self.readline()
-        if not line:
+        if not self._file:
             raise StopIteration
-        return line
+        self.nextfile()
+        # Recursive call
+        return self.__next__()
 
     def __getitem__(self, i):
-        if i != self._lineno:
+        if i != self.lineno():
             raise RuntimeError("accessing lines out of order")
         try:
             return self.__next__()
@@ -289,6 +277,7 @@ class FileInput:
         finally:
             file = self._file
             self._file = None
+            self._readline = self._start_readline
             try:
                 if file and not self._isstdin:
                     file.close()
@@ -300,85 +289,81 @@ class FileInput:
                     except OSError: pass
 
                 self._isstdin = False
-                self._buffer = []
-                self._bufindex = 0
 
     def readline(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
+        while True:
+            line = self._readline()
+            if line:
+                self._filelineno += 1
+                return line
+            if not self._file:
+                return line
+            self.nextfile()
+            # repeat with next file
+
+    def _start_readline(self):
+        if not self._files:
+            if 'b' in self._mode:
+                return b''
+            else:
+                return ''
+        self._filename = self._files[0]
+        self._files = self._files[1:]
+        self._startlineno = self.lineno()
+        self._filelineno = 0
+        self._file = None
+        self._isstdin = False
+        self._backupfilename = 0
+        if self._filename == '-':
+            self._filename = '<stdin>'
+            if 'b' in self._mode:
+                self._file = getattr(sys.stdin, 'buffer', sys.stdin)
+            else:
+                self._file = sys.stdin
+            self._isstdin = True
         else:
-            self._bufindex += 1
-            self._lineno += 1
-            self._filelineno += 1
-            return line
-        if not self._file:
-            if not self._files:
-                if 'b' in self._mode:
-                    return b''
+            if self._inplace:
+                self._backupfilename = (
+                    self._filename + (self._backup or ".bak"))
+                try:
+                    os.unlink(self._backupfilename)
+                except OSError:
+                    pass
+                # The next few lines may raise OSError
+                os.rename(self._filename, self._backupfilename)
+                self._file = open(self._backupfilename, self._mode)
+                try:
+                    perm = os.fstat(self._file.fileno()).st_mode
+                except OSError:
+                    self._output = open(self._filename, "w")
                 else:
-                    return ''
-            self._filename = self._files[0]
-            self._files = self._files[1:]
-            self._filelineno = 0
-            self._file = None
-            self._isstdin = False
-            self._backupfilename = 0
-            if self._filename == '-':
-                self._filename = '<stdin>'
-                if 'b' in self._mode:
-                    self._file = getattr(sys.stdin, 'buffer', sys.stdin)
-                else:
-                    self._file = sys.stdin
-                self._isstdin = True
-            else:
-                if self._inplace:
-                    self._backupfilename = (
-                        self._filename + (self._backup or ".bak"))
+                    mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
+                    if hasattr(os, 'O_BINARY'):
+                        mode |= os.O_BINARY
+
+                    fd = os.open(self._filename, mode, perm)
+                    self._output = os.fdopen(fd, "w")
                     try:
-                        os.unlink(self._backupfilename)
+                        if hasattr(os, 'chmod'):
+                            os.chmod(self._filename, perm)
                     except OSError:
                         pass
-                    # The next few lines may raise OSError
-                    os.rename(self._filename, self._backupfilename)
-                    self._file = open(self._backupfilename, self._mode)
-                    try:
-                        perm = os.fstat(self._file.fileno()).st_mode
-                    except OSError:
-                        self._output = open(self._filename, "w")
-                    else:
-                        mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
-                        if hasattr(os, 'O_BINARY'):
-                            mode |= os.O_BINARY
-
-                        fd = os.open(self._filename, mode, perm)
-                        self._output = os.fdopen(fd, "w")
-                        try:
-                            if hasattr(os, 'chmod'):
-                                os.chmod(self._filename, perm)
-                        except OSError:
-                            pass
-                    self._savestdout = sys.stdout
-                    sys.stdout = self._output
+                self._savestdout = sys.stdout
+                sys.stdout = self._output
+            else:
+                # This may raise OSError
+                if self._openhook:
+                    self._file = self._openhook(self._filename, self._mode)
                 else:
-                    # This may raise OSError
-                    if self._openhook:
-                        self._file = self._openhook(self._filename, self._mode)
-                    else:
-                        self._file = open(self._filename, self._mode)
-        self._buffer = self._file.readlines(self._bufsize)
-        self._bufindex = 0
-        if not self._buffer:
-            self.nextfile()
-        # Recursive call
-        return self.readline()
+                    self._file = open(self._filename, self._mode)
+        self._readline = self._file.readline
+        return self._readline()
 
     def filename(self):
         return self._filename
 
     def lineno(self):
-        return self._lineno
+        return self._startlineno + self._filelineno
 
     def filelineno(self):
         return self._filelineno
index 91c11668bd6dbeb296c4a1d0b46a04dfbbeeca16..784bc92d23c80d6dfc8b550228fcbdfb8e5cbd7d 100644 (file)
@@ -46,6 +46,42 @@ def remove_tempfiles(*names):
         if name:
             safe_unlink(name)
 
+class LineReader:
+
+    def __init__(self):
+        self._linesread = []
+
+    @property
+    def linesread(self):
+        try:
+            return self._linesread[:]
+        finally:
+            self._linesread = []
+
+    def openhook(self, filename, mode):
+        self.it = iter(filename.splitlines(True))
+        return self
+
+    def readline(self, size=None):
+        line = next(self.it, '')
+        self._linesread.append(line)
+        return line
+
+    def readlines(self, hint=-1):
+        lines = []
+        size = 0
+        while True:
+            line = self.readline()
+            if not line:
+                return lines
+            lines.append(line)
+            size += len(line)
+            if size >= hint:
+                return lines
+
+    def close(self):
+        pass
+
 class BufferSizesTests(unittest.TestCase):
     def test_buffer_sizes(self):
         # First, run the tests with default and teeny buffer size.
@@ -289,7 +325,7 @@ class FileInputTests(unittest.TestCase):
         self.addCleanup(safe_unlink, TESTFN)
 
         with FileInput(files=TESTFN,
-                       openhook=hook_encoded('ascii'), bufsize=8) as fi:
+                       openhook=hook_encoded('ascii')) as fi:
             try:
                 self.assertEqual(fi.readline(), 'A\n')
                 self.assertEqual(fi.readline(), 'B\n')
@@ -457,6 +493,38 @@ class FileInputTests(unittest.TestCase):
 
         self.assertEqual(result, -1, "fileno() should return -1")
 
+    def test_readline_buffering(self):
+        src = LineReader()
+        with FileInput(files=['line1\nline2', 'line3\n'],
+                       openhook=src.openhook) as fi:
+            self.assertEqual(src.linesread, [])
+            self.assertEqual(fi.readline(), 'line1\n')
+            self.assertEqual(src.linesread, ['line1\n'])
+            self.assertEqual(fi.readline(), 'line2')
+            self.assertEqual(src.linesread, ['line2'])
+            self.assertEqual(fi.readline(), 'line3\n')
+            self.assertEqual(src.linesread, ['', 'line3\n'])
+            self.assertEqual(fi.readline(), '')
+            self.assertEqual(src.linesread, [''])
+            self.assertEqual(fi.readline(), '')
+            self.assertEqual(src.linesread, [])
+
+    def test_iteration_buffering(self):
+        src = LineReader()
+        with FileInput(files=['line1\nline2', 'line3\n'],
+                       openhook=src.openhook) as fi:
+            self.assertEqual(src.linesread, [])
+            self.assertEqual(next(fi), 'line1\n')
+            self.assertEqual(src.linesread, ['line1\n'])
+            self.assertEqual(next(fi), 'line2')
+            self.assertEqual(src.linesread, ['line2'])
+            self.assertEqual(next(fi), 'line3\n')
+            self.assertEqual(src.linesread, ['', 'line3\n'])
+            self.assertRaises(StopIteration, next, fi)
+            self.assertEqual(src.linesread, [''])
+            self.assertRaises(StopIteration, next, fi)
+            self.assertEqual(src.linesread, [])
+
 class MockFileInput:
     """A class that mocks out fileinput.FileInput for use during unit tests"""
 
index 3043456cdae4b9d3552ac1a303ca8a7ee5c940dc..e88938ca538e8c3e4182f8a7cb4b2e89a7361847 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -91,6 +91,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #15068: Got rid of excessive buffering in the fileinput module.
+  The bufsize parameter is no longer used.
+
 - Issue #2202: Fix UnboundLocalError in
   AbstractDigestAuthHandler.get_algorithm_impls.  Initial patch by Mathieu Dupuy.