Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.

author Victor Stinner <victor.stinner@haypocalc.com>
Mon, 4 Apr 2011 23:48:03 +0000 (01:48 +0200)
committer Victor Stinner <victor.stinner@haypocalc.com>
Mon, 4 Apr 2011 23:48:03 +0000 (01:48 +0200)
diff --git a/Lib/test/test_imp.py b/Lib/test/test_imp.py

index 83e17d3d3e043030244e921372abf91cfd43bb61..88d2a3e9291d1363f3eff0178bcc850d80b5e129 100644 (file)
--- a/Lib/test/test_imp.py
+++ b/Lib/test/test_imp.py
@@ -58,6 +58,12 @@ class ImportTests(unittest.TestCase):
              with imp.find_module('module_' + mod, self.test_path)[0] as fd:
                  self.assertEqual(fd.encoding, encoding)
  
+        path = [os.path.dirname(__file__)]
+        self.assertRaisesRegex(SyntaxError,
+            r"Non-UTF-8 code starting with '\\xf6'"
+            r" in file .*badsyntax_pep3120.py",
+            imp.find_module, 'badsyntax_pep3120', path)
+
      def test_issue1267(self):
          for mod, encoding, _ in self.test_strings:
              fp, filename, info  = imp.find_module('module_' + mod,
diff --git a/Misc/NEWS b/Misc/NEWS

index 30d7c50e72832fedc87f6f6ac07f8c612325a3a2..ef274eb0c8c41171f25a1a6b3eb8a8041bed3767 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
  Core and Builtins
  -----------------
  
+- Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.
+
  - Issue #10785: Store the filename as Unicode in the Python parser.
  
  - Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 5edd9589be45b327a730a592e0266ac737a66a65..f4d7e3fc683218b5e942c67358cd13a3dc900218 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1690,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
      return result;
  }
  
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+   the file starts with a BOM.
  
-   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
-   the first or second line of the file (in which case the encoding
-   should be assumed to be PyUnicode_GetDefaultEncoding()).
+   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+   encoding in the first or second line of the file (in which case the encoding
+   should be assumed to be UTF-8).
+
+   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+   by the caller. */
  
-   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
-   by the caller.
-*/
  char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
  {
      struct tok_state *tok;
      FILE *fp;
@@ -1720,9 +1721,18 @@ PyTokenizer_FindEncoding(int fd)
          return NULL;
      }
  #ifndef PGEN
-    tok->filename = PyUnicode_FromString("<string>");
-    if (tok->filename == NULL)
-        goto error;
+    if (filename != NULL) {
+        Py_INCREF(filename);
+        tok->filename = filename;
+    }
+    else {
+        tok->filename = PyUnicode_FromString("<string>");
+        if (tok->filename == NULL) {
+            fclose(fp);
+            PyTokenizer_Free(tok);
+            return encoding;
+        }
+    }
  #endif
      while (tok->lineno < 2 && tok->done == E_OK) {
          PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@ PyTokenizer_FindEncoding(int fd)
          if (encoding)
          strcpy(encoding, tok->encoding);
      }
-#ifndef PGEN
-error:
-#endif
      PyTokenizer_Free(tok);
      return encoding;
  }
  
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+    return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
  #ifdef Py_DEBUG
  
  void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h

index 3a0d3cb08e8121a2f10bd33605549d35418916bf..ed1f3aa147e9f64af10ea31799e84ed2eac9a4d0 100644 (file)
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -75,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
  extern int PyTokenizer_Get(struct tok_state *, char **, char **);
  extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
                                            int len, int *offset);
-extern char * PyTokenizer_FindEncoding(int);
  
  #ifdef __cplusplus
  }
diff --git a/Python/import.c b/Python/import.c

index b074b834e90d9450addb5d1749f8cdb95f0d8638..4159a8ece6bd47453576f6f2dba5011fc52b9762 100644 (file)
--- a/Python/import.c
+++ b/Python/import.c
@@ -124,12 +124,12 @@ static const Py_UNICODE PYC_TAG_UNICODE[] = {
  /* See _PyImport_FixupExtensionObject() below */
  static PyObject *extensions = NULL;
  
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
+
  /* This table is defined in config.c: */
  extern struct _inittab _PyImport_Inittab[];
  
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
-
  struct _inittab *PyImport_Inittab = _PyImport_Inittab;
  
  /* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@ call_find_module(PyObject *name, PyObject *path_list)
      }
      if (fd != -1) {
          if (strchr(fdp->mode, 'b') == NULL) {
-            /* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
+            /* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
                 memory. */
-            found_encoding = PyTokenizer_FindEncoding(fd);
+            found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
              lseek(fd, 0, 0); /* Reset position */
              if (found_encoding == NULL && PyErr_Occurred()) {
                  Py_XDECREF(pathobj);
diff --git a/Python/traceback.c b/Python/traceback.c

index f0142da7929cf0bdcb4d7e36e2f72ed855310d86..e74a1474dfdaa3df20705992e41eaac37106d5a9 100644 (file)
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -18,8 +18,8 @@
  #define MAX_FRAME_DEPTH 100
  #define MAX_NTHREADS 100
  
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
  
  static PyObject *
  tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent)
  
      /* use the right encoding to decode the file as unicode */
      fd = PyObject_AsFileDescriptor(binary);
-    found_encoding = PyTokenizer_FindEncoding(fd);
+    found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
      encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
      lseek(fd, 0, 0); /* Reset position */
      fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);
author	Victor Stinner <victor.stinner@haypocalc.com>
	Mon, 4 Apr 2011 23:48:03 +0000 (01:48 +0200)
committer	Victor Stinner <victor.stinner@haypocalc.com>
	Mon, 4 Apr 2011 23:48:03 +0000 (01:48 +0200)
Lib/test/test_imp.py		patch | blob | history
Misc/NEWS		patch | blob | history
Parser/tokenizer.c		patch | blob | history
Parser/tokenizer.h		patch | blob | history
Python/import.c		patch | blob | history
Python/traceback.c		patch | blob | history