Issue #18960: Fix bugs with Python source code encoding in the second line.

author Serhiy Storchaka <storchaka@gmail.com>

Thu, 9 Jan 2014 16:36:09 +0000 (18:36 +0200)

committer Serhiy Storchaka <storchaka@gmail.com>

Thu, 9 Jan 2014 16:36:09 +0000 (18:36 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Thu, 9 Jan 2014 16:36:09 +0000 (18:36 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Thu, 9 Jan 2014 16:36:09 +0000 (18:36 +0200)
diff --git a/Lib/idlelib/IOBinding.py b/Lib/idlelib/IOBinding.py

index cba80483a60332edf5a2413cdf82de74cd6d2d5e..f008b46799e80d373415d986cf7ef188d714c6d3 100644 (file)
--- a/Lib/idlelib/IOBinding.py
+++ b/Lib/idlelib/IOBinding.py
@@ -64,6 +64,7 @@ encoding = locale_encoding  ### KBK 07Sep07  This is used all over IDLE, check!
                              ### 'encoding' is used below in encode(), check!
  
  coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
  
  def coding_spec(data):
      """Return the encoding declaration according to PEP 263.
@@ -93,6 +94,8 @@ def coding_spec(data):
          match = coding_re.match(line)
          if match is not None:
              break
+        if not blank_re.match(line):
+            return None
      else:
          return None
      name = match.group(1)
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py

index b7c646129cd168a741b8c01583cf95d9150035bb..1bb931e9db3127d35ac0f2cee15d839f57e996d6 100644 (file)
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -237,6 +237,7 @@ class Untokenizer:
              toks_append(tokval)
  
  cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
  
  def _get_normal_name(orig_enc):
      """Imitates get_normal_name in tokenizer.c."""
@@ -309,6 +310,8 @@ def detect_encoding(readline):
      encoding = find_cookie(first)
      if encoding:
          return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
  
      second = read_or_stop()
      if not second:
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 17650855eb33b5a702be8c614d3ecf7101a98fa5..6ed859707f8917332f0d4d9e0ffd295124997843 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -885,6 +885,39 @@ class TestDetectEncoding(TestCase):
          readline = self.get_readline(lines)
          self.assertRaises(SyntaxError, detect_encoding, readline)
  
+    def test_cookie_second_line_noncommented_first_line(self):
+        lines = (
+            b"print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        expected = [b"print('\xc2\xa3')\n"]
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_commented_first_line(self):
+        lines = (
+            b"#print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_empty_first_line(self):
+        lines = (
+            b'\n',
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
      def test_latin1_normalization(self):
          # See get_normal_name() in tokenizer.c.
          encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index d0609e85451e4ebf56c8ae5f5f7d61e64fe7f683..294bf9a068a3aabf15982a39f3255f2d27446a3c 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@ from codecs import lookup, BOM_UTF8
  import collections
  from io import TextIOWrapper
  cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
  
  import token
  __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
@@ -409,6 +410,8 @@ def detect_encoding(readline):
      encoding = find_cookie(first)
      if encoding:
          return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
  
      second = read_or_stop()
      if not second:
diff --git a/Misc/NEWS b/Misc/NEWS

index afe8514fa70141721bc56197d9c51384f6850dfc..6bf5a3091f3a133e2d0ef34455e8c3e7f9940233 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,13 @@ What's New in Python 3.3.4 release candidate 1?
  Core and Builtins
  -----------------
  
+- Issue #18960: The first line of a Python script could be executed twice when
+  the source encoding was specified on the second line.  Now the source encoding
+  declaration on the second line isn't effective if the first line contains
+  anything except a comment.  'python -x' now works again with files that have
+  a source encoding declaration, and can be used to make Python batch files
+  on Windows.
+
  - Issue #19081: When a zipimport .zip file in sys.path being imported from
    is modified during the lifetime of the Python process after zipimport has
    already cached the zip's table of contents we detect this and recover
@@ -36,6 +43,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #18960: The tokenize module now ignores the source encoding declaration
+  on the second line if the first line contains anything except a comment.
+
  - Issue #20078: Reading malformed zipfiles no longer hangs with 100% CPU
    consumption.
  
@@ -204,6 +214,9 @@ Library
  IDLE
  ----
  
+- Issue #18960: IDLE now ignores the source encoding declaration on the second
+  line if the first line contains anything except a comment.
+
  - Issue #20058: sys.stdin.readline() in IDLE now always returns only one line.
  
  - Issue #19481: print() of string subclass instance in IDLE no longer hangs.
@@ -281,6 +294,13 @@ Build
  - Add workaround for VS 2010 nmake clean issue. VS 2010 doesn't set up PATH
    for nmake.exe correctly.
  
+Tools/Demos
+-----------
+
+- Issue #18960: 2to3 and the findnocoding.py script now ignore the source
+  encoding declaration on the second line if the first line contains anything
+  except a comment.
+
  
  What's New in Python 3.3.3?
  ===========================
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 5c0bd6eb8a12a2aec78434db38d7a3d655984622..a69d7880da7ab2bf7d3de66bb1595dbb5ae80140 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -283,13 +283,27 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
      char *cs;
      int r = 1;
  
-    if (tok->cont_line)
+    if (tok->cont_line) {
          /* It's a continuation line, so it can't be a coding spec. */
+        tok->read_coding_spec = 1;
          return 1;
+    }
      if (!get_coding_spec(line, &cs, size, tok))
          return 0;
-    if (!cs)
+    if (!cs) {
+        Py_ssize_t i;
+        for (i = 0; i < size; i++) {
+            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
+                break;
+            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
+                /* Stop checking coding spec after a line containing
+                 * anything except a comment. */
+                tok->read_coding_spec = 1;
+                break;
+            }
+        }
          return 1;
+    }
      tok->read_coding_spec = 1;
      if (tok->encoding == NULL) {
          assert(tok->decoding_state == STATE_RAW);
@@ -476,13 +490,17 @@ fp_setreadl(struct tok_state *tok, const char* enc)
      _Py_IDENTIFIER(open);
      _Py_IDENTIFIER(readline);
      int fd;
+    long pos;
  
      io = PyImport_ImportModuleNoBlock("io");
      if (io == NULL)
          goto cleanup;
  
      fd = fileno(tok->fp);
-    if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
+    /* Due to buffering the file offset for fd can be different from the file
+     * position of tok->fp. */
+    pos = ftell(tok->fp);
+    if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
          PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
          goto cleanup;
      }
@@ -751,7 +769,7 @@ decode_str(const char *input, int single, struct tok_state *tok)
      if (newl[0]) {
          if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
              return error_ret(tok);
-        if (tok->enc == NULL && newl[1]) {
+        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
              if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                     tok, buf_setreadl))
                  return error_ret(tok);
diff --git a/Tools/scripts/findnocoding.py b/Tools/scripts/findnocoding.py

index c0997d6598e44c11ecb147d06984644e0e2cea86..5f3795e65754dbcfbf70c2a0028532628b5b75f8 100755 (executable)
--- a/Tools/scripts/findnocoding.py
+++ b/Tools/scripts/findnocoding.py
@@ -33,6 +33,7 @@ except ImportError:
  
  
  decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
+blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')
  
  def get_declaration(line):
      match = decl_re.match(line)
@@ -58,7 +59,8 @@ def needs_declaration(fullpath):
          line1 = infile.readline()
          line2 = infile.readline()
  
-        if get_declaration(line1) or get_declaration(line2):
+        if (get_declaration(line1) or
+            blank_re.match(line1) and get_declaration(line2)):
              # the file does have an encoding declaration, so trust it
              return False
author	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 9 Jan 2014 16:36:09 +0000 (18:36 +0200)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 9 Jan 2014 16:36:09 +0000 (18:36 +0200)
Lib/idlelib/IOBinding.py		patch \| blob \| history
Lib/lib2to3/pgen2/tokenize.py		patch \| blob \| history
Lib/test/test_tokenize.py		patch \| blob \| history
Lib/tokenize.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Parser/tokenizer.c		patch \| blob \| history
Tools/scripts/findnocoding.py		patch \| blob \| history