Issue #10335: Add tokenize.open(), detect the file encoding using tokenize.detect_encoding() and open it in read only mode.

author Victor Stinner <victor.stinner@haypocalc.com>

Tue, 9 Nov 2010 01:08:59 +0000 (01:08 +0000)

committer Victor Stinner <victor.stinner@haypocalc.com>

Tue, 9 Nov 2010 01:08:59 +0000 (01:08 +0000)
author Victor Stinner <victor.stinner@haypocalc.com>
Tue, 9 Nov 2010 01:08:59 +0000 (01:08 +0000)
committer Victor Stinner <victor.stinner@haypocalc.com>
Tue, 9 Nov 2010 01:08:59 +0000 (01:08 +0000)
diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst

index dbd01c478d594ba7740200229c64d0f336681343..6a96609596e40f76298e68f6f4d654af54f898fb 100644 (file)
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -101,14 +101,16 @@ function it uses to do this is available:
      If no encoding is specified, then the default of ``'utf-8'`` will be
      returned.
  
-    :func:`detect_encoding` is useful for robustly reading Python source files.
-    A common pattern for this follows::
+    Use :func:`open` to open Python source files: it uses
+    :func:`detect_encoding` to detect the file encoding.
  
-        def read_python_source(file_name):
-            with open(file_name, "rb") as fp:
-                encoding = tokenize.detect_encoding(fp.readline)[0]
-            with open(file_name, "r", encoding=encoding) as fp:
-                return fp.read()
+
+.. function:: open(filename)
+
+   Open a file in read only mode using the encoding detected by
+   :func:`detect_encoding`.
+
+   .. versionadded:: 3.2
  
  
  Example of a script rewriter that transforms float literals into Decimal
@@ -153,4 +155,3 @@ objects::
                  result.append((toknum, tokval))
          return untokenize(result).decode('utf-8')
  
-
diff --git a/Lib/linecache.py b/Lib/linecache.py

index 974b1d965ab500ff46b10ac5378e5fa8a0400ffe..c3f2c3fdca4340c4f6781b264d39a47c2e0bb19a 100644 (file)
--- a/Lib/linecache.py
+++ b/Lib/linecache.py
@@ -123,9 +123,7 @@ def updatecache(filename, module_globals=None):
          else:
              return []
      try:
-        with open(fullname, 'rb') as fp:
-            coding, line = tokenize.detect_encoding(fp.readline)
-        with open(fullname, 'r', encoding=coding) as fp:
+        with tokenize.open(fullname) as fp:
              lines = fp.readlines()
      except IOError:
          return []
diff --git a/Lib/py_compile.py b/Lib/py_compile.py

index 111893efc53b944884b551e47cf7f987220be7b4..d241434a602eef99e29dac00373e846e3c3e614c 100644 (file)
--- a/Lib/py_compile.py
+++ b/Lib/py_compile.py
@@ -104,9 +104,7 @@ def compile(file, cfile=None, dfile=None, doraise=False):
      byte-compile all installed files (or all files in selected
      directories).
      """
-    with open(file, "rb") as f:
-        encoding = tokenize.detect_encoding(f.readline)[0]
-    with open(file, encoding=encoding) as f:
+    with tokenize.open(file) as f:
          try:
              timestamp = int(os.fstat(f.fileno()).st_mtime)
          except AttributeError:
diff --git a/Lib/tabnanny.py b/Lib/tabnanny.py

index 7053fd9398dbbd64b80fbd393d949ed41a9545b2..a4d4ef0da097540b7820df845cd54129575b15b6 100755 (executable)
--- a/Lib/tabnanny.py
+++ b/Lib/tabnanny.py
@@ -93,11 +93,8 @@ def check(file):
                  check(fullname)
          return
  
-    with open(file, 'rb') as f:
-        encoding, lines = tokenize.detect_encoding(f.readline)
-
      try:
-        f = open(file, encoding=encoding)
+        f = tokenize.open(file)
      except IOError as msg:
          errprint("%r: I/O Error: %s" % (file, msg))
          return
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 10e59b944255be342b331dc56b38146cecfca404..f98efcbb5397bc7a90f1d727bb5dad4b722ffce2 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -564,7 +564,8 @@ Non-ascii identifiers
  
  from test import support
  from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
-                     STRING, ENDMARKER, tok_name, detect_encoding)
+                     STRING, ENDMARKER, tok_name, detect_encoding,
+                     open as tokenize_open)
  from io import BytesIO
  from unittest import TestCase
  import os, sys, glob
@@ -857,6 +858,26 @@ class TestDetectEncoding(TestCase):
          readline = self.get_readline((b'# coding: bad\n',))
          self.assertRaises(SyntaxError, detect_encoding, readline)
  
+    def test_open(self):
+        filename = support.TESTFN + '.py'
+        self.addCleanup(support.unlink, filename)
+
+        # test coding cookie
+        for encoding in ('iso-8859-15', 'utf-8'):
+            with open(filename, 'w', encoding=encoding) as fp:
+                print("# coding: %s" % encoding, file=fp)
+                print("print('euro:\u20ac')", file=fp)
+            with tokenize_open(filename) as fp:
+                assert fp.encoding == encoding
+                assert fp.mode == 'r'
+
+        # test BOM (no coding cookie)
+        with open(filename, 'w', encoding='utf-8-sig') as fp:
+            print("print('euro:\u20ac')", file=fp)
+        with tokenize_open(filename) as fp:
+            assert fp.encoding == 'utf-8-sig'
+            assert fp.mode == 'r'
+
  class TestTokenize(TestCase):
  
      def test_tokenize(self):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index eb58831cced338e829a0c3fd5b1fd5ccae91c261..7745412edfb589d2f3b4c5be085705151f141084 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -29,6 +29,7 @@ import sys
  from token import *
  from codecs import lookup, BOM_UTF8
  import collections
+from io import TextIOWrapper
  cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
  
  import token
@@ -335,6 +336,20 @@ def detect_encoding(readline):
      return default, [first, second]
  
  
+_builtin_open = open
+
+def open(filename):
+    """Open a file in read only mode using the encoding detected by
+    detect_encoding().
+    """
+    buffer = _builtin_open(filename, 'rb')
+    encoding, lines = detect_encoding(buffer.readline)
+    buffer.seek(0)
+    text = TextIOWrapper(buffer, encoding, line_buffering=True)
+    text.mode = 'r'
+    return text
+
+
  def tokenize(readline):
      """
      The tokenize() generator requires one argment, readline, which
diff --git a/Lib/trace.py b/Lib/trace.py

index 8ea4b898fbe5ef780ab654803fcb30dce3913658..b50aa02d0b95a2c34f5c1c80136dd5281b8b1150 100644 (file)
--- a/Lib/trace.py
+++ b/Lib/trace.py
@@ -432,10 +432,9 @@ def find_strings(filename, encoding=None):
  def find_executable_linenos(filename):
      """Return dict where keys are line numbers in the line number table."""
      try:
-        with io.FileIO(filename, 'r') as file:
-            encoding, lines = tokenize.detect_encoding(file.readline)
-        with open(filename, "r", encoding=encoding) as f:
+        with tokenize.open(filename) as f:
              prog = f.read()
+            encoding = f.encoding
      except IOError as err:
          print(("Not printing coverage data for %r: %s"
                                % (filename, err)), file=sys.stderr)
diff --git a/Misc/NEWS b/Misc/NEWS

index 5586118eb97914493ea70a935c3fe212ee0be751..48f952ef9c7b44a7ecdd6b27aa7df31a4dbc745e 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -60,6 +60,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #10335: Add tokenize.open(), detect the file encoding using
+  tokenize.detect_encoding() and open it in read only mode.
+
  - Issue #10321: Added support for binary data to smtplib.SMTP.sendmail,
    and a new method send_message to send an email.message.Message object.
author	Victor Stinner <victor.stinner@haypocalc.com>
	Tue, 9 Nov 2010 01:08:59 +0000 (01:08 +0000)
committer	Victor Stinner <victor.stinner@haypocalc.com>
	Tue, 9 Nov 2010 01:08:59 +0000 (01:08 +0000)
Doc/library/tokenize.rst		patch \| blob \| history
Lib/linecache.py		patch \| blob \| history
Lib/py_compile.py		patch \| blob \| history
Lib/tabnanny.py		patch \| blob \| history
Lib/test/test_tokenize.py		patch \| blob \| history
Lib/tokenize.py		patch \| blob \| history
Lib/trace.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history