Issue 28128: Print out better error/warning messages for invalid string escapes....

author Eric V. Smith <eric@trueblade.com>

Mon, 31 Oct 2016 18:46:26 +0000 (14:46 -0400)

committer Eric V. Smith <eric@trueblade.com>

Mon, 31 Oct 2016 18:46:26 +0000 (14:46 -0400)
author Eric V. Smith <eric@trueblade.com>
Mon, 31 Oct 2016 18:46:26 +0000 (14:46 -0400)
committer Eric V. Smith <eric@trueblade.com>
Mon, 31 Oct 2016 18:46:26 +0000 (14:46 -0400)
diff --git a/Include/bytesobject.h b/Include/bytesobject.h

index 11d8218402dc1de65beb2ada88a2f36d6a2a2949..98e29b687914f7f8f380f0b1330b42214b098169 100644 (file)
--- a/Include/bytesobject.h
+++ b/Include/bytesobject.h
@@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
  PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
                                                    const char *, Py_ssize_t,
                                                    const char *);
+/* Helper for PyBytes_DecodeEscape that detects invalid escape chars.
+   When decoding succeeds, the pointer stored through the last argument is
+   NULL if no invalid escape sequence was seen; otherwise it points at the
+   character following the backslash of the first invalid escape sequence
+   in the input.  The helper itself does not emit any warning. */
+PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
+                                             const char *, Py_ssize_t,
+                                             const char *,
+                                             const char **);
  
  /* Macro, trading safety for speed */
  #ifndef Py_LIMITED_API
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 5711de0164eacad23e75dcbcf37defd9f32f4a24..b5ef3e4130383c4cc379aaf5b02e1e71acc589bc 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
      const char *errors          /* error handling */
      );
  
+/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
+   chars.  It only records the first invalid escape; issuing a warning is
+   left to the caller. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
+        const char *string,     /* Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        const char **first_invalid_escape  /* on return, points to first
+                                              invalid escaped char in
+                                              string, or is NULL if no
+                                              invalid escape was found. */
+);
+
  PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
      PyObject *unicode           /* Unicode object */
      );
diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py

index 37ace230f5e62822dce29c57ca8531ae4d523e68..54f2be3598396a99012883915ddc46b8de069774 100644 (file)
--- a/Lib/test/test_string_literals.py
+++ b/Lib/test/test_string_literals.py
@@ -31,6 +31,7 @@ import os
  import sys
  import shutil
  import tempfile
+import warnings
  import unittest
  
  
@@ -104,6 +105,19 @@ class TestLiterals(unittest.TestCase):
          self.assertRaises(SyntaxError, eval, r""" '\U000000' """)
          self.assertRaises(SyntaxError, eval, r""" '\U0000000' """)
  
+    def test_eval_str_invalid_escape(self):
+        # Every character code in 1..127 that is not listed below (i.e. that
+        # does not begin a recognized str escape) must emit a
+        # DeprecationWarning, and the backslash must be kept in the result.
+        for b in range(1, 128):
+            if b in b"""\n\r"'01234567NU\\abfnrtuvx""":
+                continue
+            with self.assertWarns(DeprecationWarning):
+                self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b))
+        # Exactly one warning is expected for the literal below, and it must
+        # report the filename '<string>' and line number 2.
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always', category=DeprecationWarning)
+            eval("'''\n\\z'''")
+        self.assertEqual(len(w), 1)
+        self.assertEqual(w[0].filename, '<string>')
+        self.assertEqual(w[0].lineno, 2)
+
      def test_eval_str_raw(self):
          self.assertEqual(eval(""" r'x' """), 'x')
          self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@@ -130,6 +144,19 @@ class TestLiterals(unittest.TestCase):
          self.assertRaises(SyntaxError, eval, r""" b'\x' """)
          self.assertRaises(SyntaxError, eval, r""" b'\x0' """)
  
+    def test_eval_bytes_invalid_escape(self):
+        # Every byte value in 1..127 that is not listed below (i.e. that does
+        # not begin a recognized bytes escape) must emit a
+        # DeprecationWarning, and the backslash must be kept in the result.
+        # Unlike the str test, N, U and u are not skipped here.
+        for b in range(1, 128):
+            if b in b"""\n\r"'01234567\\abfnrtvx""":
+                continue
+            with self.assertWarns(DeprecationWarning):
+                self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b]))
+        # Exactly one warning is expected for the literal below, and it must
+        # report the filename '<string>' and line number 2.
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always', category=DeprecationWarning)
+            eval("b'''\n\\z'''")
+        self.assertEqual(len(w), 1)
+        self.assertEqual(w[0].filename, '<string>')
+        self.assertEqual(w[0].lineno, 2)
+
      def test_eval_bytes_raw(self):
          self.assertEqual(eval(""" br'x' """), b'x')
          self.assertEqual(eval(""" rb'x' """), b'x')
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index fe6cd28c63fd5c973cdeaf10328dd977c056e4d7..0737140ccfa878ec8f9b905c000ee6f508f77d57 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -2413,13 +2413,6 @@ class UnicodeTest(string_tests.CommonTest,
          support.check_free_after_iterating(self, iter, str)
          support.check_free_after_iterating(self, reversed, str)
  
-    def test_invalid_sequences(self):
-        for letter in string.ascii_letters + "89": # 0-7 are octal escapes
-            if letter in "abfnrtuvxNU":
-                continue
-            with self.assertWarns(DeprecationWarning):
-                eval(r"'\%s'" % letter)
-
  
  class CAPITest(unittest.TestCase):
  
diff --git a/Misc/NEWS b/Misc/NEWS

index a621be316f8a38597a48e47703797df042fdc04f..c9bfe017bd90a54082b1c357306eb796f9c6f0a7 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.6.0 beta 3
  Core and Builtins
  -----------------
  
+- Issue #28128: Deprecation warning for invalid str and bytes escape
+  sequences now prints better information about where the error
+  occurs. Patch by Serhiy Storchaka and Eric Smith.
+
  - Issue #28509: dict.update() no longer allocate unnecessary large memory.
  
  - Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c

index 598f6a13cfcf7278258a54b5e18a02a6460f3854..779fe295db00eda8f7a255a6029bacd84a692185 100644 (file)
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -1105,11 +1105,12 @@ _PyBytes_DecodeEscapeRecode(const char **s, const char *end,
      return p;
  }
  
-PyObject *PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape(const char *s,
                                  Py_ssize_t len,
                                  const char *errors,
                                  Py_ssize_t unicode,
-                                const char *recode_encoding)
+                                const char *recode_encoding,
+                                const char **first_invalid_escape)
  {
      int c;
      char *p;
@@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const char *s,
          return NULL;
      writer.overallocate = 1;
  
+    *first_invalid_escape = NULL;
+
      end = s + len;
      while (s < end) {
          if (*s != '\\') {
@@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const char *s,
              break;
  
          default:
-            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0)
-                goto failed;
+            if (*first_invalid_escape == NULL) {
+                *first_invalid_escape = s-1; /* Back up one char, since we've
+                                                already incremented s. */
+            }
              *p++ = '\\';
+            s--;
              goto non_esc; /* an arbitrary number of unescaped
                               UTF-8 bytes may follow. */
          }
@@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const char *s,
      return NULL;
  }
  
+/* Decode the backslash escapes in the len bytes starting at s.
+
+   Thin wrapper around _PyBytes_DecodeEscape(): every argument is passed
+   through unchanged.  If the helper reports an invalid escape sequence, a
+   DeprecationWarning naming the first offending character is issued.
+
+   Returns the decoded object, or NULL if decoding fails or if issuing the
+   warning fails (e.g. because warnings are turned into exceptions); in the
+   latter case the already decoded result is released. */
+PyObject *PyBytes_DecodeEscape(const char *s,
+                                Py_ssize_t len,
+                                const char *errors,
+                                Py_ssize_t unicode,
+                                const char *recode_encoding)
+{
+    const char *first_invalid_escape;
+    PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode,
+                                             recode_encoding,
+                                             &first_invalid_escape);
+    if (result == NULL)
+        return NULL;
+    if (first_invalid_escape != NULL) {
+        /* Cast to unsigned char: where plain char is signed, a byte >= 0x80
+           would be promoted to a negative int, which the %c format rejects
+           with OverflowError instead of producing the warning. */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                             "invalid escape sequence '\\%c'",
+                             (unsigned char)*first_invalid_escape) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+    }
+    return result;
+}
  /* -------------------------------------------------------------------- */
  /* object api */
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index e45f3d7c27b923c05d167860e0cc92ea5f23a6a3..50b21cf9e65822ce3dd31ad51f1fdd30c0cb87ac 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5896,9 +5896,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
  static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
  
  PyObject *
-PyUnicode_DecodeUnicodeEscape(const char *s,
-                              Py_ssize_t size,
-                              const char *errors)
+_PyUnicode_DecodeUnicodeEscape(const char *s,
+                               Py_ssize_t size,
+                               const char *errors,
+                               const char **first_invalid_escape)
  {
      const char *starts = s;
      _PyUnicodeWriter writer;
@@ -5906,6 +5907,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
      PyObject *errorHandler = NULL;
      PyObject *exc = NULL;
  
+    // so we can remember if we've seen an invalid escape char or not
+    *first_invalid_escape = NULL;
+
      if (size == 0) {
          _Py_RETURN_UNICODE_EMPTY();
      }
@@ -6080,9 +6084,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
              goto error;
  
          default:
-            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
-                                 "invalid escape sequence '\\%c'", c) < 0)
-                goto onError;
+            if (*first_invalid_escape == NULL) {
+                *first_invalid_escape = s-1; /* Back up one char, since we've
+                                                already incremented s. */
+            }
              WRITE_ASCII_CHAR('\\');
              WRITE_CHAR(c);
              continue;
@@ -6117,6 +6122,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
      return NULL;
  }
  
+/* Decode the Unicode-Escape encoded buffer of size bytes starting at s.
+
+   Thin wrapper around _PyUnicode_DecodeUnicodeEscape(): s, size and errors
+   are passed through unchanged.  If the helper reports an invalid escape
+   sequence, a DeprecationWarning naming the first offending character is
+   issued.
+
+   Returns the decoded object, or NULL if decoding fails or if issuing the
+   warning fails (e.g. because warnings are turned into exceptions); in the
+   latter case the already decoded result is released. */
+PyObject *
+PyUnicode_DecodeUnicodeEscape(const char *s,
+                              Py_ssize_t size,
+                              const char *errors)
+{
+    const char *first_invalid_escape;
+    PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
+                                                      &first_invalid_escape);
+    if (result == NULL)
+        return NULL;
+    if (first_invalid_escape != NULL) {
+        /* Cast to unsigned char: where plain char is signed, a byte >= 0x80
+           would be promoted to a negative int, which the %c format rejects
+           with OverflowError instead of producing the warning. */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                             "invalid escape sequence '\\%c'",
+                             (unsigned char)*first_invalid_escape) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+    }
+    return result;
+}
+
  /* Return a Unicode-Escape string version of the Unicode object.
  
     If quotes is true, the string is enclosed in u"" or u'' quotes as
diff --git a/Python/ast.c b/Python/ast.c

index 76daf6f446205a72ff73097aa384e5101bbdd71e..91e7d0129f47caf572b13d5b6365068020d0aaa1 100644 (file)
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -4113,8 +4113,34 @@ decode_utf8(struct compiling *c, const char **sPtr, const char *end)
      return PyUnicode_DecodeUTF8(t, s - t, NULL);
  }
  
+/* Warn about an invalid escape sequence found while compiling node n.
+
+   Issues a DeprecationWarning attributed to c->c_filename and to the line
+   of n.  If issuing the warning raises DeprecationWarning (warnings turned
+   into errors), that exception is replaced by an error reported through
+   ast_error() with the same message.  Any other failure of the warning
+   machinery is propagated unchanged.
+
+   Returns 0 if the warning was issued, -1 with an exception set
+   otherwise. */
+static int
+warn_invalid_escape_sequence(struct compiling *c, const node *n,
+                             char first_invalid_escape_char)
+{
+    /* Cast to unsigned char: where plain char is signed, a byte >= 0x80
+       would be promoted to a negative int, which %c rejects. */
+    PyObject *msg = PyUnicode_FromFormat(
+        "invalid escape sequence \\%c",
+        (unsigned char)first_invalid_escape_char);
+    if (msg == NULL) {
+        return -1;
+    }
+    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg,
+                                   c->c_filename, LINENO(n),
+                                   NULL, NULL) < 0)
+    {
+        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
+            const char *s;
+
+            /* Drop the pending DeprecationWarning before building its
+               replacement, so no C-API call runs with it still set. */
+            PyErr_Clear();
+
+            s = PyUnicode_AsUTF8(msg);
+            if (s != NULL) {
+                ast_error(c, n, s);
+            }
+        }
+        /* Every failure is reported to the caller; previously an exception
+           other than DeprecationWarning fell through to "return 0" and was
+           left pending while success was reported. */
+        Py_DECREF(msg);
+        return -1;
+    }
+    Py_DECREF(msg);
+    return 0;
+}
+
  static PyObject *
-decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
+decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s,
+                            size_t len)
  {
      PyObject *v, *u;
      char *buf;
@@ -4167,11 +4193,41 @@ decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
      len = p - buf;
      s = buf;
  
-    v = PyUnicode_DecodeUnicodeEscape(s, len, NULL);
+    const char *first_invalid_escape;
+    v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
+
+    if (v != NULL && first_invalid_escape != NULL) {
+        if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {
+            /* u has not been decref'ed yet because first_invalid_escape
+               points inside u. */
+            Py_XDECREF(u);
+            Py_DECREF(v);
+            return NULL;
+        }
+    }
      Py_XDECREF(u);
      return v;
  }
  
+/* Decode the escapes of the len bytes at s and warn about invalid ones.
+
+   Calls _PyBytes_DecodeEscape(); if it reports an invalid escape sequence,
+   warn_invalid_escape_sequence() is called for node n with the first
+   offending character.  Returns the decoded object, or NULL if decoding
+   fails or the warning call fails (the decoded result is released in that
+   case). */
+static PyObject *
+decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s,
+                          size_t len)
+{
+    const char *first_invalid_escape;
+    PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL,
+                                             &first_invalid_escape);
+    if (result == NULL)
+        return NULL;
+
+    if (first_invalid_escape != NULL) {
+        if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+    }
+    return result;
+}
+
  /* Compile this expression in to an expr_ty.  Add parens around the
     expression, in order to allow leading spaces in the expression. */
  static expr_ty
@@ -4310,7 +4366,7 @@ done:
                                                      literal_end-literal_start,
                                                      NULL, NULL);
          else
-            *literal = decode_unicode_with_escapes(c, literal_start,
+            *literal = decode_unicode_with_escapes(c, n, literal_start,
                                                     literal_end-literal_start);
          if (!*literal)
              return -1;
@@ -5048,12 +5104,12 @@ parsestr(struct compiling *c, const node *n, int *bytesmode, int *rawmode,
          if (*rawmode)
              *result = PyBytes_FromStringAndSize(s, len);
          else
-            *result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL);
+            *result = decode_bytes_with_escapes(c, n, s, len);
      } else {
          if (*rawmode)
              *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
          else
-            *result = decode_unicode_with_escapes(c, s, len);
+            *result = decode_unicode_with_escapes(c, n, s, len);
      }
      return *result == NULL ? -1 : 0;
  }
author	Eric V. Smith <eric@trueblade.com>
	Mon, 31 Oct 2016 18:46:26 +0000 (14:46 -0400)
committer	Eric V. Smith <eric@trueblade.com>
	Mon, 31 Oct 2016 18:46:26 +0000 (14:46 -0400)
Include/bytesobject.h		patch \| blob \| history
Include/unicodeobject.h		patch \| blob \| history
Lib/test/test_string_literals.py		patch \| blob \| history
Lib/test/test_unicode.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/bytesobject.c		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history
Python/ast.c		patch \| blob \| history