]> granicus.if.org Git - python/commitdiff
Issue #15068: Got rid of excessive buffering in the fileinput module.
authorSerhiy Storchaka <storchaka@gmail.com>
Tue, 8 Mar 2016 16:35:45 +0000 (18:35 +0200)
committerSerhiy Storchaka <storchaka@gmail.com>
Tue, 8 Mar 2016 16:35:45 +0000 (18:35 +0200)
The bufsize parameter is no longer used.

Doc/library/fileinput.rst
Lib/fileinput.py
Lib/test/test_fileinput.py
Misc/NEWS

index 710bef39c6088ae97bd8ac6dee1bc0ed23957c3f..a2ffeff83be6c33d79aac26a1e2b2cdd27313833 100644 (file)
@@ -60,6 +60,9 @@ The following function is the primary interface of this module:
    .. versionchanged:: 2.5
       Added the *mode* and *openhook* parameters.
 
+   .. versionchanged:: 2.7.12
+      The *bufsize* parameter is no longer used.
+
 The following functions use the global state created by :func:`fileinput.input`;
 if there is no active state, :exc:`RuntimeError` is raised.
 
@@ -143,6 +146,9 @@ available for subclassing as well:
    .. versionchanged:: 2.5
       Added the *mode* and *openhook* parameters.
 
+   .. versionchanged:: 2.7.12
+      The *bufsize* parameter is no longer used.
+
 **Optional in-place filtering:** if the keyword argument ``inplace=1`` is passed
 to :func:`fileinput.input` or to the :class:`FileInput` constructor, the file is
 moved to a backup file and standard output is directed to the input file (if a
index 02dc4c61273f284ab9989f92dc80ec1a95e071ef..f54632ce65e1b1ad424cfc3bfbacdf3cfd697165 100644 (file)
@@ -64,13 +64,6 @@ deleted when the output file is closed.  In-place filtering is
 disabled when standard input is read.  XXX The current implementation
 does not work for MS-DOS 8+3 filesystems.
 
-Performance: this module is unfortunately one of the slower ways of
-processing large numbers of input lines.  Nevertheless, a significant
-speed-up has been obtained by using readlines(bufsize) instead of
-readline().  A new keyword argument, bufsize=N, is present on the
-input() function and the FileInput() class to override the default
-buffer size.
-
 XXX Possible additions:
 
 - optional getopt argument processing
@@ -86,6 +79,7 @@ __all__ = ["input","close","nextfile","filename","lineno","filelineno",
 
 _state = None
 
+# No longer used
 DEFAULT_BUFSIZE = 8*1024
 
 def input(files=None, inplace=0, backup="", bufsize=0,
@@ -207,17 +201,15 @@ class FileInput:
         self._files = files
         self._inplace = inplace
         self._backup = backup
-        self._bufsize = bufsize or DEFAULT_BUFSIZE
         self._savestdout = None
         self._output = None
         self._filename = None
-        self._lineno = 0
+        self._startlineno = 0
         self._filelineno = 0
         self._file = None
+        self._readline = self._start_readline
         self._isstdin = False
         self._backupfilename = None
-        self._buffer = []
-        self._bufindex = 0
         # restrict mode argument to reading modes
         if mode not in ('r', 'rU', 'U', 'rb'):
             raise ValueError("FileInput opening mode must be one of "
@@ -242,22 +234,18 @@ class FileInput:
         return self
 
     def next(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
-        else:
-            self._bufindex += 1
-            self._lineno += 1
+        line = self._readline()
+        if line:
             self._filelineno += 1
             return line
-        line = self.readline()
-        if not line:
+        if not self._file:
             raise StopIteration
-        return line
+        self.nextfile()
+        # Recursive call
+        return self.next()
 
     def __getitem__(self, i):
-        if i != self._lineno:
+        if i != self.lineno():
             raise RuntimeError, "accessing lines out of order"
         try:
             return self.next()
@@ -277,7 +265,8 @@ class FileInput:
                 output.close()
         finally:
             file = self._file
-            self._file = 0
+            self._file = None
+            self._readline = self._start_readline
             try:
                 if file and not self._isstdin:
                     file.close()
@@ -289,75 +278,72 @@ class FileInput:
                     except OSError: pass
 
                 self._isstdin = False
-                self._buffer = []
-                self._bufindex = 0
 
     def readline(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
+        while 1:
+            line = self._readline()
+            if line:
+                self._filelineno += 1
+                return line
+            if not self._file:
+                return line
+            self.nextfile()
+            # repeat with next file
+
+    def _start_readline(self):
+        if not self._files:
+            return ""
+        self._filename = self._files[0]
+        self._files = self._files[1:]
+        self._startlineno = self.lineno()
+        self._filelineno = 0
+        self._file = None
+        self._isstdin = False
+        self._backupfilename = 0
+        if self._filename == '-':
+            self._filename = '<stdin>'
+            self._file = sys.stdin
+            self._isstdin = True
         else:
-            self._bufindex += 1
-            self._lineno += 1
-            self._filelineno += 1
-            return line
-        if not self._file:
-            if not self._files:
-                return ""
-            self._filename = self._files[0]
-            self._files = self._files[1:]
-            self._filelineno = 0
-            self._file = None
-            self._isstdin = False
-            self._backupfilename = 0
-            if self._filename == '-':
-                self._filename = '<stdin>'
-                self._file = sys.stdin
-                self._isstdin = True
-            else:
-                if self._inplace:
-                    self._backupfilename = (
-                        self._filename + (self._backup or os.extsep+"bak"))
-                    try: os.unlink(self._backupfilename)
-                    except os.error: pass
-                    # The next few lines may raise IOError
-                    os.rename(self._filename, self._backupfilename)
-                    self._file = open(self._backupfilename, self._mode)
+            if self._inplace:
+                self._backupfilename = (
+                    self._filename + (self._backup or os.extsep+"bak"))
+                try: os.unlink(self._backupfilename)
+                except os.error: pass
+                # The next few lines may raise IOError
+                os.rename(self._filename, self._backupfilename)
+                self._file = open(self._backupfilename, self._mode)
+                try:
+                    perm = os.fstat(self._file.fileno()).st_mode
+                except OSError:
+                    self._output = open(self._filename, "w")
+                else:
+                    fd = os.open(self._filename,
+                                    os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
+                                    perm)
+                    self._output = os.fdopen(fd, "w")
                     try:
-                        perm = os.fstat(self._file.fileno()).st_mode
+                        if hasattr(os, 'chmod'):
+                            os.chmod(self._filename, perm)
                     except OSError:
-                        self._output = open(self._filename, "w")
-                    else:
-                        fd = os.open(self._filename,
-                                     os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
-                                     perm)
-                        self._output = os.fdopen(fd, "w")
-                        try:
-                            if hasattr(os, 'chmod'):
-                                os.chmod(self._filename, perm)
-                        except OSError:
-                            pass
-                    self._savestdout = sys.stdout
-                    sys.stdout = self._output
+                        pass
+                self._savestdout = sys.stdout
+                sys.stdout = self._output
+            else:
+                # This may raise IOError
+                if self._openhook:
+                    self._file = self._openhook(self._filename, self._mode)
                 else:
-                    # This may raise IOError
-                    if self._openhook:
-                        self._file = self._openhook(self._filename, self._mode)
-                    else:
-                        self._file = open(self._filename, self._mode)
-        self._buffer = self._file.readlines(self._bufsize)
-        self._bufindex = 0
-        if not self._buffer:
-            self.nextfile()
-        # Recursive call
-        return self.readline()
+                    self._file = open(self._filename, self._mode)
+
+        self._readline = self._file.readline
+        return self._readline()
 
     def filename(self):
         return self._filename
 
     def lineno(self):
-        return self._lineno
+        return self._startlineno + self._filelineno
 
     def filelineno(self):
         return self._filelineno
index facc56e24f66e4fcb5b337fa6464fe532a2c30a4..a6f09949ffd8ba6a98946e3ffd314a6dd88d830e 100644 (file)
@@ -5,7 +5,7 @@ Nick Mathewson
 
 import unittest
 from test.test_support import verbose, TESTFN, run_unittest
-from test.test_support import unlink as safe_unlink
+from test.test_support import unlink as safe_unlink, check_warnings
 import sys, re
 from StringIO import StringIO
 from fileinput import FileInput, hook_encoded
@@ -28,6 +28,42 @@ def remove_tempfiles(*names):
     for name in names:
         safe_unlink(name)
 
+class LineReader:
+
+    def __init__(self):
+        self._linesread = []
+
+    @property
+    def linesread(self):
+        try:
+            return self._linesread[:]
+        finally:
+            self._linesread = []
+
+    def openhook(self, filename, mode):
+        self.it = iter(filename.splitlines(True))
+        return self
+
+    def readline(self, size=None):
+        line = next(self.it, '')
+        self._linesread.append(line)
+        return line
+
+    def readlines(self, hint=-1):
+        lines = []
+        size = 0
+        while True:
+            line = self.readline()
+            if not line:
+                return lines
+            lines.append(line)
+            size += len(line)
+            if size >= hint:
+                return lines
+
+    def close(self):
+        pass
+
 class BufferSizesTests(unittest.TestCase):
     def test_buffer_sizes(self):
         # First, run the tests with default and teeny buffer size.
@@ -228,7 +264,7 @@ class FileInputTests(unittest.TestCase):
             f.write('\x80')
         self.addCleanup(safe_unlink, TESTFN)
 
-        fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'), bufsize=8)
+        fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'))
         # The most likely failure is a UnicodeDecodeError due to the entire
         # file being read when it shouldn't have been.
         self.assertEqual(fi.readline(), u'A\n')
@@ -239,6 +275,38 @@ class FileInputTests(unittest.TestCase):
             list(fi)
         fi.close()
 
+    def test_readline_buffering(self):
+        src = LineReader()
+        fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook)
+        self.assertEqual(src.linesread, [])
+        self.assertEqual(fi.readline(), 'line1\n')
+        self.assertEqual(src.linesread, ['line1\n'])
+        self.assertEqual(fi.readline(), 'line2')
+        self.assertEqual(src.linesread, ['line2'])
+        self.assertEqual(fi.readline(), 'line3\n')
+        self.assertEqual(src.linesread, ['', 'line3\n'])
+        self.assertEqual(fi.readline(), '')
+        self.assertEqual(src.linesread, [''])
+        self.assertEqual(fi.readline(), '')
+        self.assertEqual(src.linesread, [])
+        fi.close()
+
+    def test_iteration_buffering(self):
+        src = LineReader()
+        fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook)
+        self.assertEqual(src.linesread, [])
+        self.assertEqual(next(fi), 'line1\n')
+        self.assertEqual(src.linesread, ['line1\n'])
+        self.assertEqual(next(fi), 'line2')
+        self.assertEqual(src.linesread, ['line2'])
+        self.assertEqual(next(fi), 'line3\n')
+        self.assertEqual(src.linesread, ['', 'line3\n'])
+        self.assertRaises(StopIteration, next, fi)
+        self.assertEqual(src.linesread, [''])
+        self.assertRaises(StopIteration, next, fi)
+        self.assertEqual(src.linesread, [])
+        fi.close()
+
 class Test_hook_encoded(unittest.TestCase):
     """Unit tests for fileinput.hook_encoded()"""
 
index 71cc689143c70e7a78f68177d53077ceaa278d46..2afab1cb9ba211b48213b557097c48f16a8f7d62 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -58,6 +58,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #15068: Got rid of excessive buffering in the fileinput module.
+  The bufsize parameter is no longer used.
+
 - Issue #2202: Fix UnboundLocalError in
   AbstractDigestAuthHandler.get_algorithm_impls.  Initial patch by Mathieu Dupuy.