Latin-1 source code was not being properly decoded when passed through compile() (issue #3574).

author Brett Cannon <bcannon@gmail.com>

Fri, 17 Oct 2008 03:38:50 +0000 (03:38 +0000)

committer Brett Cannon <bcannon@gmail.com>

Fri, 17 Oct 2008 03:38:50 +0000 (03:38 +0000)
author Brett Cannon <bcannon@gmail.com>
Fri, 17 Oct 2008 03:38:50 +0000 (03:38 +0000)
committer Brett Cannon <bcannon@gmail.com>
Fri, 17 Oct 2008 03:38:50 +0000 (03:38 +0000)
diff --git a/Lib/test/test_pep3120.py b/Lib/test/test_pep3120.py

index 3bb30ca76144a9671a812144a1759b48c2bad885..81d15bc8f40ce6a80643758dafdf4debfadee690 100644 (file)
--- a/Lib/test/test_pep3120.py
+++ b/Lib/test/test_pep3120.py
@@ -23,8 +23,24 @@ class PEP3120Test(unittest.TestCase):
          else:
              self.fail("expected exception didn't occur")
  
+
+class BuiltinCompileTests(unittest.TestCase):
+
+    # Issue 3574.
+    def test_latin1(self):
+        # Allow compile() to read Latin-1 source.
+        source_code = '# coding: Latin-1\nu = "Ç"\n'.encode("Latin-1")
+        try:
+            code = compile(source_code, '<dummy>', 'exec')
+        except SyntaxError:
+            self.fail("compile() cannot handle Latin-1 source")
+        ns = {}
+        exec(code, ns)
+        self.assertEqual('Ç', ns['u'])
+
+
  def test_main():
-    support.run_unittest(PEP3120Test)
+    support.run_unittest(PEP3120Test, BuiltinCompileTests)
  
  if __name__=="__main__":
      test_main()
diff --git a/Misc/NEWS b/Misc/NEWS

index 0f47afbab9b847743234daba6d62196645d82cd8..ede8e520973d473b4862cee9c615788456aca279 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -15,6 +15,8 @@ What's New in Python 3.0 beta 5
  Core and Builtins
  -----------------
  
+- Issue #3574: compile() incorrectly handled source code encoded as Latin-1.
+
  - Issues #2384 and #3975: Tracebacks were not correctly printed when the
    source file contains a ``coding:`` header: the wrong line was displayed, and
    the encoding was not respected.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 4edf6d07e44ece96717e436cc5e0486d38de22c9..ce8129ddd123f0406ef9e88c932a4e24813489e8 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -135,6 +135,7 @@ tok_new(void)
         tok->decoding_state = STATE_INIT;
         tok->decoding_erred = 0;
         tok->read_coding_spec = 0;
+       tok->enc = NULL;
         tok->encoding = NULL;
          tok->cont_line = 0;
  #ifndef PGEN
@@ -274,8 +275,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                 tok->read_coding_spec = 1;
                 if (tok->encoding == NULL) {
                         assert(tok->decoding_state == STATE_RAW);
-                       if (strcmp(cs, "utf-8") == 0 ||
-                           strcmp(cs, "iso-8859-1") == 0) {
+                       if (strcmp(cs, "utf-8") == 0) {
                                 tok->encoding = cs;
                         } else {
                                 r = set_readline(tok, cs);
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h

index c45dea1a8b0b87b1025da238629198cc0a6e6df4..df9cbc74b94569f64ca9af57fe3a6a90ddf73663 100644 (file)
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -49,14 +49,14 @@ struct tok_state {
         enum decoding_state decoding_state;
         int decoding_erred;     /* whether erred in decoding  */
         int read_coding_spec;   /* whether 'coding:...' has been read  */
-       char *encoding;
+       char *encoding;         /* Source encoding. */
         int cont_line;          /* whether we are in a continuation line. */
         const char* line_start; /* pointer to start of current line */
  #ifndef PGEN
         PyObject *decoding_readline; /* codecs.open(...).readline */
         PyObject *decoding_buffer;
  #endif
-       const char* enc;
+       const char* enc;        /* Encoding for the current str. */
         const char* str;
  };
  
diff --git a/Python/ast.c b/Python/ast.c

index 6d2fa09b17dd34e8fbed4cdb35cc58ef8a716330..60906a1f569d310119009d9dd273c12ef2834ed1 100644 (file)
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -3160,9 +3160,6 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
      if (encoding == NULL) {
          buf = (char *)s;
          u = NULL;
-    } else if (strcmp(encoding, "iso-8859-1") == 0) {
-        buf = (char *)s;
-        u = NULL;
      } else {
          /* check for integer overflow */
          if (len > PY_SIZE_MAX / 4)
@@ -3275,8 +3272,7 @@ parsestr(struct compiling *c, const node *n, int *bytesmode)
          }
      }
      need_encoding = (!*bytesmode && c->c_encoding != NULL &&
-                     strcmp(c->c_encoding, "utf-8") != 0 &&
-                     strcmp(c->c_encoding, "iso-8859-1") != 0);
+                     strcmp(c->c_encoding, "utf-8") != 0);
      if (rawmode || strchr(s, '\\') == NULL) {
          if (need_encoding) {
              PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);
author	Brett Cannon <bcannon@gmail.com>
	Fri, 17 Oct 2008 03:38:50 +0000 (03:38 +0000)
committer	Brett Cannon <bcannon@gmail.com>
	Fri, 17 Oct 2008 03:38:50 +0000 (03:38 +0000)
Lib/test/test_pep3120.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Parser/tokenizer.c		patch \| blob \| history
Parser/tokenizer.h		patch \| blob \| history
Python/ast.c		patch \| blob \| history