raise a SyntaxError in detect_encoding() when a codec lookup fails, matching the builtin interpreter behavior (issue #4021)

author Benjamin Peterson <benjamin@python.org>

Fri, 12 Dec 2008 01:25:05 +0000 (01:25 +0000)

committer Benjamin Peterson <benjamin@python.org>

Fri, 12 Dec 2008 01:25:05 +0000 (01:25 +0000)
author Benjamin Peterson <benjamin@python.org>
Fri, 12 Dec 2008 01:25:05 +0000 (01:25 +0000)
committer Benjamin Peterson <benjamin@python.org>
Fri, 12 Dec 2008 01:25:05 +0000 (01:25 +0000)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 8fbd216ac66fde8f46565d0f932a005548823d18..75a7a809b4fde23a1e3d06379943b5086f04c088 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -795,6 +795,8 @@ class TestDetectEncoding(TestCase):
          self.assertEquals(encoding, 'utf-8')
          self.assertEquals(consumed_lines, [])
  
+        readline = self.get_readline((b'# coding: bad\n',))
+        self.assertRaises(SyntaxError, detect_encoding, readline)
  
  class TestTokenize(TestCase):
  
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index ec5a79a6453b0866f85eea6c702e1d3e47984769..16c4f3f029830e24020f251762912d62b8afd61d 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
  
  import re, string, sys
  from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
  from itertools import chain, repeat
  cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
  
@@ -251,11 +251,11 @@ def detect_encoding(readline):
  
      It detects the encoding from the presence of a utf-8 bom or an encoding
      cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
  
      If no encoding is specified, then the default of 'utf-8' will be returned.
      """
-    utf8_bom = b'\xef\xbb\xbf'
      bom_found = False
      encoding = None
      def read_or_stop():
@@ -268,18 +268,25 @@ def detect_encoding(readline):
          try:
              line_string = line.decode('ascii')
          except UnicodeDecodeError:
-            pass
-        else:
-            matches = cookie_re.findall(line_string)
-            if matches:
-                encoding = matches[0]
-                if bom_found and lookup(encoding).name != 'utf-8':
-                    # This behaviour mimics the Python interpreter
-                    raise SyntaxError('encoding problem: utf-8')
-                return encoding
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
  
      first = read_or_stop()
-    if first.startswith(utf8_bom):
+    if first.startswith(BOM_UTF8):
          bom_found = True
          first = first[3:]
      if not first:
diff --git a/Misc/NEWS b/Misc/NEWS

index 10a074599066af2d0b2f13d783c02cc4177f483a..6568a1c5a38766abd48c240a945dc8a2c18980b4 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -45,6 +45,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #4021: tokenize.detect_encoding() now raises a SyntaxError when the
+  codec cannot be found.  This is for compatibility with the builtin behavior.
+
  - Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
    give correct results in the case where one argument is a quiet NaN
    and the other is a finite number that requires rounding.
author	Benjamin Peterson <benjamin@python.org>
	Fri, 12 Dec 2008 01:25:05 +0000 (01:25 +0000)
committer	Benjamin Peterson <benjamin@python.org>
	Fri, 12 Dec 2008 01:25:05 +0000 (01:25 +0000)
Lib/test/test_tokenize.py		patch | blob | history
Lib/tokenize.py		patch | blob | history
Misc/NEWS		patch | blob | history