Issue #14629: Raise SyntaxError in tokenize.detect_encoding

author Martin v. Löwis <martin@v.loewis.de>

Fri, 20 Apr 2012 12:36:47 +0000 (14:36 +0200)

committer Martin v. Löwis <martin@v.loewis.de>

Fri, 20 Apr 2012 12:36:47 +0000 (14:36 +0200)
author Martin v. Löwis <martin@v.loewis.de>
Fri, 20 Apr 2012 12:36:47 +0000 (14:36 +0200)
committer Martin v. Löwis <martin@v.loewis.de>
Fri, 20 Apr 2012 12:36:47 +0000 (14:36 +0200)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 9e9656ced60a46938dd1b1381226c9af944eb020..63d084df731babedc2a0c057c26f9ba717c85fa8 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -825,6 +825,16 @@ class TestDetectEncoding(TestCase):
                  found, consumed_lines = detect_encoding(rl)
                  self.assertEqual(found, "iso-8859-1")
  
+    def test_syntaxerror_latin1(self):
+        # Issue 14629: need to raise SyntaxError if the first
+        # line(s) have non-UTF-8 characters
+        lines = (
+            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
+            )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+
      def test_utf8_normalization(self):
          # See get_normal_name() in tokenizer.c.
          encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index f575e9bc237a9258254c29391889cae3e4137572..f283c6dd7f9e486a79e573e91f715264748064f2 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -292,9 +292,12 @@ def detect_encoding(readline):
  
      def find_cookie(line):
          try:
-            line_string = line.decode('ascii')
+            # Decode as UTF-8. Either the line is an encoding declaration,
+            # in which case it should be pure ASCII, or it must be UTF-8
+            # per default encoding.
+            line_string = line.decode('utf-8')
          except UnicodeDecodeError:
-            return None
+            raise SyntaxError("invalid or missing encoding declaration")
  
          matches = cookie_re.findall(line_string)
          if not matches:
diff --git a/Misc/NEWS b/Misc/NEWS

index 47d7b1097e077bde2bc3a815149398162ce8a5c8..addae1b54c6523b3cb7183189f7ec8edd328258b 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #14629: Raise SyntaxError in tokenize.detect_encoding if the
+  first two lines have non-UTF-8 characters without an encoding declaration.
+
  - Issue #14308: Fix an exception when a "dummy" thread is in the threading
    module's active list after a fork().
author	Martin v. Löwis <martin@v.loewis.de>
	Fri, 20 Apr 2012 12:36:47 +0000 (14:36 +0200)
committer	Martin v. Löwis <martin@v.loewis.de>
	Fri, 20 Apr 2012 12:36:47 +0000 (14:36 +0200)
Lib/test/test_tokenize.py		patch \| blob \| history
Lib/tokenize.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history