normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does

author Benjamin Peterson <benjamin@python.org>

Fri, 9 Oct 2009 21:43:09 +0000 (21:43 +0000)

committer Benjamin Peterson <benjamin@python.org>

Fri, 9 Oct 2009 21:43:09 +0000 (21:43 +0000)
author Benjamin Peterson <benjamin@python.org>
Fri, 9 Oct 2009 21:43:09 +0000 (21:43 +0000)
committer Benjamin Peterson <benjamin@python.org>
Fri, 9 Oct 2009 21:43:09 +0000 (21:43 +0000)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index f395ed43c9999ae2013b42c3994ff6ec22f4c187..ba705bac7ac61d579d511bae159cddc7b1f51756 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
              b'do_something(else)\n'
          )
          encoding, consumed_lines = detect_encoding(self.get_readline(lines))
-        self.assertEquals(encoding, 'latin-1')
+        self.assertEquals(encoding, 'iso-8859-1')
          self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
  
      def test_matched_bom_and_cookie_first_line(self):
@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
          readline = self.get_readline(lines)
          self.assertRaises(SyntaxError, detect_encoding, readline)
  
+    def test_latin1_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
+                     "iso-8859-1-unix", "iso-latin-1-mac")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"print(things)\n",
+                         b"do_something += 4\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "iso-8859-1")
+
+    def test_utf8_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"1 + 3\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "utf-8")
+
      def test_short_files(self):
          readline = self.get_readline((b'print(something)\n',))
          encoding, consumed_lines = detect_encoding(readline)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index f83bda522a665d16bc67db26187c0ff3f9cdad5c..fb58c6b77a06d740671ea926ee705bea23ffa645 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -279,6 +279,17 @@ def untokenize(iterable):
      return out
  
  
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
  def detect_encoding(readline):
      """
      The detect_encoding() function is used to detect the encoding that should
@@ -313,7 +324,7 @@ def detect_encoding(readline):
          matches = cookie_re.findall(line_string)
          if not matches:
              return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
          try:
              codec = lookup(encoding)
          except LookupError:
diff --git a/Misc/NEWS b/Misc/NEWS

index 61f91ed0cafc07ebbc82dd2d17809ae712284679..f542bcb30fd87ea940a5587a2d64b77562495314 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -87,6 +87,9 @@ C-API
  Library
  -------
  
+- Make tokenize.detect_encoding() normalize utf-8 and iso-8859-1 variants like
+  the builtin tokenizer.
+
  - Issue #7048: Force Decimal.logb to round its result when that result
    is too large to fit in the current precision.
author	Benjamin Peterson <benjamin@python.org>
	Fri, 9 Oct 2009 21:43:09 +0000 (21:43 +0000)
committer	Benjamin Peterson <benjamin@python.org>
	Fri, 9 Oct 2009 21:43:09 +0000 (21:43 +0000)
Lib/test/test_tokenize.py		patch \| blob \| history
Lib/tokenize.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history