granicus.if.org Git - python/commitdiff

Patch #1031213: Decode source line in SyntaxErrors back to its original
source encoding.

author     Martin v. Löwis <martin@v.loewis.de>
           Tue, 4 Sep 2007 14:20:25 +0000 (14:20 +0000)
committer  Martin v. Löwis <martin@v.loewis.de>
           Tue, 4 Sep 2007 14:20:25 +0000 (14:20 +0000)

Lib/test/test_compiler.py
Misc/NEWS
Parser/parsetok.c
Parser/tokenizer.c
Parser/tokenizer.h

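Before this patch, the line attached to a SyntaxError was handed back in the tokenizer's internal UTF-8 form, so for source with a non-ASCII coding declaration both SyntaxError.text and SyntaxError.offset could disagree with the file as written. A minimal sketch of the behaviour the commit establishes, mirroring the new test below (Python 2 syntax; the filename is arbitrary):

    # Shift-JIS source with a coding declaration; the string literal is
    # unterminated, so compile() raises a SyntaxError.
    sjis_line = "print '\x83\x70\x83\x43\x83\x5c\x83\x93', '\n"
    src = "#! -*- coding: ShiftJIS -*-\n" + sjis_line

    try:
        compile(src, "<sjis-example>", "exec")
    except SyntaxError, e:
        # e.text is the offending line re-encoded back to ShiftJIS and
        # e.offset is a column within that encoding (expected values
        # taken from the new test case below).
        assert e.text == sjis_line
        assert e.offset == 19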
index 229d8a370fde6c05c937aa394709f7b3f256f4c6..606ed700dfd049ad072c1ae6e72494add0fdb1fb 100644 (file)
--- a/Lib/test/test_compiler.py
+++ b/Lib/test/test_compiler.py
@@ -155,6 +155,32 @@ class CompilerTest(unittest.TestCase):
         self.assertEquals(dct.get('result'), 1)
 
 
+    def _testErrEnc(self, src, text, offset):
+        try:
+            compile(src, "", "exec")
+        except SyntaxError, e:
+            self.assertEquals(e.offset, offset)
+            self.assertEquals(e.text, text)
+
+    def testSourceCodeEncodingsError(self):
+        # Test SyntaxError with encoding definition
+        sjis = "print '\x83\x70\x83\x43\x83\x5c\x83\x93', '\n"
+        ascii = "print '12345678', '\n"
+        encdef = "#! -*- coding: ShiftJIS -*-\n"
+
+        # ascii source without encdef
+        self._testErrEnc(ascii, ascii, 19)
+
+        # ascii source with encdef
+        self._testErrEnc(encdef+ascii, ascii, 19)
+
+        # non-ascii source with encdef
+        self._testErrEnc(encdef+sjis, sjis, 19)
+
+        # ShiftJIS source without encdef
+        self._testErrEnc(sjis, sjis, 19)
+
+
 NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard)
 
 ###############################################################################
index c2e9a86fdcad012c436bda5fedb7e9869fc77af7..9280438c25ce9505ddb975cb9c14bab8fe39fc3c 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 2.5.2c1?
 Core and builtins
 -----------------
 
+- Patch #1031213: Decode source line in SyntaxErrors back to its original source
+  encoding.
+
 - Patch #1673759: add a missing overflow check when formatting floats
   with %G.
 
index be53e1c5912f2e19bd9fbd81e4e840372dc30123..6494a9381cf88a4e5896c2817f24bcc5e1e4038f 100644 (file)
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -216,16 +216,24 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
                        err_ret->error = E_EOF;
                err_ret->lineno = tok->lineno;
                if (tok->buf != NULL) {
+                       char *text = NULL;
                        size_t len;
                        assert(tok->cur - tok->buf < INT_MAX);
                        err_ret->offset = (int)(tok->cur - tok->buf);
                        len = tok->inp - tok->buf;
-                       err_ret->text = (char *) PyObject_MALLOC(len + 1);
-                       if (err_ret->text != NULL) {
-                               if (len > 0)
-                                       strncpy(err_ret->text, tok->buf, len);
-                               err_ret->text[len] = '\0';
+#ifdef Py_USING_UNICODE
+                       text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
+
+#endif
+                       if (text == NULL) {
+                               text = (char *) PyObject_MALLOC(len + 1);
+                               if (text != NULL) {
+                                       if (len > 0)
+                                               strncpy(text, tok->buf, len);
+                                       text[len] = '\0';
+                               }
                        }
+                       err_ret->text = text;
                }
        } else if (tok->encoding != NULL) {
                node* r = PyNode_New(encoding_decl);
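With this change parsetok() first asks the tokenizer for a copy of the failing line restored to the declared source encoding, and only falls back to the old raw-buffer copy when that returns NULL (no coding declaration, a pgen build, or a failed re-encode). A rough Python rendering of that branch, with illustrative names:

    def syntax_error_text(tok_buf, restored_text):
        # restored_text stands in for what PyTokenizer_RestoreEncoding()
        # produced, or None when it could not (or did not need to)
        # re-encode; in that case the pre-patch behaviour -- a plain copy
        # of the raw tokenizer buffer -- is kept.
        if restored_text is not None:
            return restored_text
        return tok_buf[:]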
index c58b6899b37e6b4853c03bcd560d59f0a8252b63..f6df9bb944919fcfbf521083615f362cd8ec92b0 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
        return result;
 }
 
+/* This function is only called from parsetok. However, it cannot live
+   there, as it must be empty for PGEN, and we can check for PGEN only
+   in this file. */
+
+#ifdef PGEN
+char*
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
+{
+       return NULL;
+}
+#else
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {
+       PyObject *ret = NULL;   
+       PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
+       if (unicode_text) {
+               ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
+               Py_DECREF(unicode_text);
+       }
+       if (!ret) {
+               PyErr_Print();
+       }
+       return ret;
+}
+
+char *
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
+{
+       char *text = NULL;
+       if (tok->encoding) {
+               /* convert source to original encoding */
+               PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+               if (lineobj != NULL) {
+                       int linelen = PyString_Size(lineobj);
+                       const char *line = PyString_AsString(lineobj);
+                       text = PyObject_MALLOC(linelen + 1);
+                       if (text != NULL && line != NULL) {
+                               if (linelen)
+                                       strncpy(text, line, linelen);
+                               text[linelen] = '\0';
+                       }
+                       Py_DECREF(lineobj);
+                                       
+                       /* adjust error offset */
+                       if (*offset > 1) {
+                               PyObject *offsetobj = dec_utf8(tok->encoding, 
+                                                              tok->buf, *offset-1);
+                               if (offsetobj) {
+                                       *offset = PyString_Size(offsetobj) + 1;
+                                       Py_DECREF(offsetobj);
+                               }
+                       }
+                       
+               }
+       }
+       return text;
+
+}
+#endif
+
+                          
+
 #ifdef Py_DEBUG
 
 void
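The conversion in dec_utf8()/PyTokenizer_RestoreEncoding() is a plain round trip: the tokenizer stores the line as UTF-8 internally, so the buffer is decoded from UTF-8 and re-encoded with the declared source encoding, and the 1-based error column is recomputed by re-encoding just the prefix in front of it. A rough Python 2 equivalent of that logic (the function name is illustrative, not part of the C API):

    def restore_encoding(utf8_line, enc, offset):
        # Re-encode the tokenizer's internal UTF-8 copy of the line back
        # into the declared source encoding, replacing anything unmappable.
        text = utf8_line.decode('utf-8', 'replace').encode(enc, 'replace')
        # The error column is 1-based; recompute it as the length of the
        # re-encoded prefix that precedes the error position.
        if offset > 1:
            prefix = utf8_line[:offset - 1]
            offset = len(prefix.decode('utf-8', 'replace').encode(enc, 'replace')) + 1
        return text, offset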
index 5e7ebf74f11551890713df5e903dd76b6b1d5ccb..8482cddfc54f84f054edb92bf0c9f8c5fc64cc95 100644 (file)
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -58,6 +58,8 @@ extern struct tok_state *PyTokenizer_FromString(const char *);
 extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
 extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
+extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, 
+                                         int len, int *offset);
 
 #ifdef __cplusplus
 }