bpo-37388: Development mode check encoding and errors (GH-14341)

author Victor Stinner <vstinner@redhat.com>

Tue, 25 Jun 2019 22:51:05 +0000 (00:51 +0200)

committer GitHub <noreply@github.com>

Tue, 25 Jun 2019 22:51:05 +0000 (00:51 +0200)
author Victor Stinner <vstinner@redhat.com>
Tue, 25 Jun 2019 22:51:05 +0000 (00:51 +0200)
committer GitHub <noreply@github.com>
Tue, 25 Jun 2019 22:51:05 +0000 (00:51 +0200)
diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst

index 35a17a18080996d718dd85fc077b586c70b9cd00..8575f8a72af3197b2ffef127283038071acf5197 100644 (file)
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -1559,9 +1559,16 @@ expression support in the :mod:`re` module).
     :func:`codecs.register_error`, see section :ref:`error-handlers`. For a
     list of possible encodings, see section :ref:`standard-encodings`.
  
+   By default, the *errors* argument is not checked for best performances, but
+   only used at the first encoding error. Enable the development mode
+   (:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
+
     .. versionchanged:: 3.1
        Support for keyword arguments added.
  
+   .. versionchanged:: 3.9
+      The *errors* is now checked in development mode and in debug mode.
+
  
  .. method:: str.endswith(suffix[, start[, end]])
  
@@ -2575,6 +2582,10 @@ arbitrary binary data.
     :func:`codecs.register_error`, see section :ref:`error-handlers`. For a
     list of possible encodings, see section :ref:`standard-encodings`.
  
+   By default, the *errors* argument is not checked for best performances, but
+   only used at the first decoding error. Enable the development mode
+   (:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
+
     .. note::
  
        Passing the *encoding* argument to :class:`str` allows decoding any
@@ -2584,6 +2595,9 @@ arbitrary binary data.
     .. versionchanged:: 3.1
        Added support for keyword arguments.
  
+   .. versionchanged:: 3.9
+      The *errors* is now checked in development mode and in debug mode.
+
  
  .. method:: bytes.endswith(suffix[, start[, end]])
              bytearray.endswith(suffix[, start[, end]])
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst

index 87af7e8be13376f7c934304fad886d609e9579a1..e11fe31c2fbb8f92125c13e3b37b0ecdd13b04f0 100644 (file)
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -429,6 +429,9 @@ Miscellaneous options
       not be more verbose than the default if the code is correct: new warnings
       are only emitted when an issue is detected. Effect of the developer mode:
  
+     * Check *encoding* and *errors* arguments on string encoding and decoding
+       operations. Examples: :func:`open`, :meth:`str.encode` and
+       :meth:`bytes.decode`.
       * Add ``default`` warning filter, as :option:`-W` ``default``.
       * Install debug hooks on memory allocators: see the
         :c:func:`PyMem_SetupDebugHooks` C function.
@@ -469,6 +472,10 @@ Miscellaneous options
        The ``-X pycache_prefix`` option. The ``-X dev`` option now logs
        ``close()`` exceptions in :class:`io.IOBase` destructor.
  
+   .. versionchanged:: 3.9
+      Using ``-X dev`` option, check *encoding* and *errors* arguments on
+      string encoding and decoding operations.
+
  
  Options you shouldn't use
  ~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Doc/whatsnew/3.9.rst b/Doc/whatsnew/3.9.rst

index b58c99b88daf695f5436fc3e0caf5053a07a802d..e9a31c80ea4d836dfbc7657b0e83589b2f3bc366 100644 (file)
--- a/Doc/whatsnew/3.9.rst
+++ b/Doc/whatsnew/3.9.rst
@@ -84,6 +84,15 @@ Other Language Changes
    this case.
    (Contributed by Victor Stinner in :issue:`20443`.)
  
+* In development mode and in debug build, *encoding* and *errors* arguments are
+  now checked on string encoding and decoding operations. Examples:
+  :func:`open`, :meth:`str.encode` and :meth:`bytes.decode`.
+
+  By default, for best performances, the *errors* argument is only checked at
+  the first encoding/decoding error, and the *encoding* argument is sometimes
+  ignored for empty strings.
+  (Contributed by Victor Stinner in :issue:`37388`.)
+
  
  New Modules
  ===========
diff --git a/Lib/_pyio.py b/Lib/_pyio.py

index 0b6493bc8dc926281803f955c4eb99c358512ce7..c35516430589c06b94cdb5b80d1742a09722e653 100644 (file)
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -36,6 +36,8 @@ BlockingIOError = BlockingIOError
  # Does io.IOBase finalizer log the exception if the close() method fails?
  # The exception is ignored silently by default in release build.
  _IOBASE_EMITS_UNRAISABLE = (hasattr(sys, "gettotalrefcount") or sys.flags.dev_mode)
+# Does open() check its 'errors' argument?
+_CHECK_ERRORS = _IOBASE_EMITS_UNRAISABLE
  
  
  def open(file, mode="r", buffering=-1, encoding=None, errors=None,
@@ -2022,6 +2024,8 @@ class TextIOWrapper(TextIOBase):
          else:
              if not isinstance(errors, str):
                  raise ValueError("invalid errors: %r" % errors)
+            if _CHECK_ERRORS:
+                codecs.lookup_error(errors)
  
          self._buffer = buffer
          self._decoded_chars = ''  # buffer for text returned from decoder
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py

index bbd45c75298e72ac0427a4a284673ae21c38a520..b5eeb2b4fc2901185603bde08a80ba34a0e0beb3 100644 (file)
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -12,12 +12,14 @@ import copy
  import functools
  import pickle
  import tempfile
+import textwrap
  import unittest
  
  import test.support
  import test.string_tests
  import test.list_tests
  from test.support import bigaddrspacetest, MAX_Py_ssize_t
+from test.support.script_helper import assert_python_failure
  
  
  if sys.flags.bytes_warning:
@@ -315,6 +317,62 @@ class BaseBytesTest:
          # Default encoding is utf-8
          self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603')
  
+    def test_check_encoding_errors(self):
+        # bpo-37388: bytes(str) and bytes.encode() must check encoding
+        # and errors arguments in dev mode
+        invalid = 'Boom, Shaka Laka, Boom!'
+        encodings = ('ascii', 'utf8', 'latin1')
+        code = textwrap.dedent(f'''
+            import sys
+            type2test = {self.type2test.__name__}
+            encodings = {encodings!r}
+
+            for data in ('', 'short string'):
+                try:
+                    type2test(data, encoding={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(21)
+
+                for encoding in encodings:
+                    try:
+                        type2test(data, encoding=encoding, errors={invalid!r})
+                    except LookupError:
+                        pass
+                    else:
+                        sys.exit(22)
+
+            for data in (b'', b'short string'):
+                data = type2test(data)
+                print(repr(data))
+                try:
+                    data.decode(encoding={invalid!r})
+                except LookupError:
+                    sys.exit(10)
+                else:
+                    sys.exit(23)
+
+                try:
+                    data.decode(errors={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(24)
+
+                for encoding in encodings:
+                    try:
+                        data.decode(encoding=encoding, errors={invalid!r})
+                    except LookupError:
+                        pass
+                    else:
+                        sys.exit(25)
+
+            sys.exit(10)
+        ''')
+        proc = assert_python_failure('-X', 'dev', '-c', code)
+        self.assertEqual(proc.rc, 10, proc)
+
      def test_from_int(self):
          b = self.type2test(0)
          self.assertEqual(b, self.type2test())
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py

index fc474c99053df2356cd273a3b28e8016663b21c0..1fe1cba5167fc6e630c3ee7dd4c0a97b19eabfb1 100644 (file)
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -29,6 +29,7 @@ import random
  import signal
  import sys
  import sysconfig
+import textwrap
  import threading
  import time
  import unittest
@@ -37,7 +38,8 @@ import weakref
  from collections import deque, UserList
  from itertools import cycle, count
  from test import support
-from test.support.script_helper import assert_python_ok, run_python_until_end
+from test.support.script_helper import (
+    assert_python_ok, assert_python_failure, run_python_until_end)
  from test.support import FakePath
  
  import codecs
@@ -4130,6 +4132,51 @@ class MiscIOTest(unittest.TestCase):
          # there used to be a buffer overflow in the parser for rawmode
          self.assertRaises(ValueError, self.open, support.TESTFN, 'rwax+')
  
+    def test_check_encoding_errors(self):
+        # bpo-37388: open() and TextIOWrapper must check encoding and errors
+        # arguments in dev mode
+        mod = self.io.__name__
+        filename = __file__
+        invalid = 'Boom, Shaka Laka, Boom!'
+        code = textwrap.dedent(f'''
+            import sys
+            from {mod} import open, TextIOWrapper
+
+            try:
+                open({filename!r}, encoding={invalid!r})
+            except LookupError:
+                pass
+            else:
+                sys.exit(21)
+
+            try:
+                open({filename!r}, errors={invalid!r})
+            except LookupError:
+                pass
+            else:
+                sys.exit(22)
+
+            fp = open({filename!r}, "rb")
+            with fp:
+                try:
+                    TextIOWrapper(fp, encoding={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(23)
+
+                try:
+                    TextIOWrapper(fp, errors={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(24)
+
+            sys.exit(10)
+        ''')
+        proc = assert_python_failure('-X', 'dev', '-c', code)
+        self.assertEqual(proc.rc, 10, proc)
+
  
  class CMiscIOTest(MiscIOTest):
      io = io
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index 36b72e40c7e4193962c6aceac9a1194344fcd369..177d80d27e1a41d1cae466b123d80f34fcf94d09 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -11,9 +11,11 @@ import itertools
  import operator
  import struct
  import sys
+import textwrap
  import unittest
  import warnings
  from test import support, string_tests
+from test.support.script_helper import assert_python_failure
  
  # Error handling (bad decoder return)
  def search_function(encoding):
@@ -2436,6 +2438,66 @@ class UnicodeTest(string_tests.CommonTest,
          support.check_free_after_iterating(self, iter, str)
          support.check_free_after_iterating(self, reversed, str)
  
+    def test_check_encoding_errors(self):
+        # bpo-37388: str(bytes) and str.decode() must check encoding and errors
+        # arguments in dev mode
+        encodings = ('ascii', 'utf8', 'latin1')
+        invalid = 'Boom, Shaka Laka, Boom!'
+        code = textwrap.dedent(f'''
+            import sys
+            encodings = {encodings!r}
+
+            for data in (b'', b'short string'):
+                try:
+                    str(data, encoding={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(21)
+
+                try:
+                    str(data, errors={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(22)
+
+                for encoding in encodings:
+                    try:
+                        str(data, encoding, errors={invalid!r})
+                    except LookupError:
+                        pass
+                    else:
+                        sys.exit(22)
+
+            for data in ('', 'short string'):
+                try:
+                    data.encode(encoding={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(23)
+
+                try:
+                    data.encode(errors={invalid!r})
+                except LookupError:
+                    pass
+                else:
+                    sys.exit(24)
+
+                for encoding in encodings:
+                    try:
+                        data.encode(encoding, errors={invalid!r})
+                    except LookupError:
+                        pass
+                    else:
+                        sys.exit(24)
+
+            sys.exit(10)
+        ''')
+        proc = assert_python_failure('-X', 'dev', '-c', code)
+        self.assertEqual(proc.rc, 10, proc)
+
  
  class CAPITest(unittest.TestCase):
  
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst b/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst

new file mode 100644 (file)

index 0000000..295ef1b
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst
@@ -0,0 +1,7 @@
+In development mode and in debug build, *encoding* and *errors* arguments are
+now checked on string encoding and decoding operations. Examples: :func:`open`,
+:meth:`str.encode` and :meth:`bytes.decode`.
+
+By default, for best performances, the *errors* argument is only checked at the
+first encoding/decoding error, and the *encoding* argument is sometimes ignored
+for empty strings.
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c

index 73b2756afce5e641ae028ce93f3d051d1dce1d6f..021231e4c6b58c5fef2b7e6a6f1d040a26b6ba52 100644 (file)
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -988,6 +988,46 @@ _textiowrapper_fix_encoder_state(textio *self)
      return 0;
  }
  
+static int
+io_check_errors(PyObject *errors)
+{
+    assert(errors != NULL && errors != Py_None);
+
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+#ifndef Py_DEBUG
+    /* In release mode, only check in development mode (-X dev) */
+    if (!interp->config.dev_mode) {
+        return 0;
+    }
+#else
+    /* Always check in debug mode */
+#endif
+
+    /* Avoid calling PyCodec_LookupError() before the codec registry is ready:
+       before_PyUnicode_InitEncodings() is called. */
+    if (!interp->fs_codec.encoding) {
+        return 0;
+    }
+
+    Py_ssize_t name_length;
+    const char *name = PyUnicode_AsUTF8AndSize(errors, &name_length);
+    if (name == NULL) {
+        return -1;
+    }
+    if (strlen(name) != (size_t)name_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character in errors");
+        return -1;
+    }
+    PyObject *handler = PyCodec_LookupError(name);
+    if (handler != NULL) {
+        Py_DECREF(handler);
+        return 0;
+    }
+    return -1;
+}
+
+
+
  /*[clinic input]
  _io.TextIOWrapper.__init__
      buffer: object
@@ -1057,6 +1097,9 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
              errors->ob_type->tp_name);
          return -1;
      }
+    else if (io_check_errors(errors)) {
+        return -1;
+    }
  
      if (validate_newline(newline) < 0) {
          return -1;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index cb1456ea847acfd813d3c5f37c46e9692789fe43..b6f3d8f18c4c52c241d83bfe8eea3be12e303dca 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,6 +427,48 @@ get_error_handler_wide(const wchar_t *errors)
  }
  
  
+static inline int
+unicode_check_encoding_errors(const char *encoding, const char *errors)
+{
+    if (encoding == NULL && errors == NULL) {
+        return 0;
+    }
+
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+#ifndef Py_DEBUG
+    /* In release mode, only check in development mode (-X dev) */
+    if (!interp->config.dev_mode) {
+        return 0;
+    }
+#else
+    /* Always check in debug mode */
+#endif
+
+    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
+       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
+    if (!interp->fs_codec.encoding) {
+        return 0;
+    }
+
+    if (encoding != NULL) {
+        PyObject *handler = _PyCodec_Lookup(encoding);
+        if (handler == NULL) {
+            return -1;
+        }
+        Py_DECREF(handler);
+    }
+
+    if (errors != NULL) {
+        PyObject *handler = PyCodec_LookupError(errors);
+        if (handler == NULL) {
+            return -1;
+        }
+        Py_DECREF(handler);
+    }
+    return 0;
+}
+
+
  /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
     This function is kept for backward compatibility with the old API. */
  Py_UNICODE
@@ -3211,12 +3253,15 @@ PyUnicode_FromEncodedObject(PyObject *obj,
  
      /* Decoding bytes objects is the most common case and should be fast */
      if (PyBytes_Check(obj)) {
-        if (PyBytes_GET_SIZE(obj) == 0)
+        if (PyBytes_GET_SIZE(obj) == 0) {
+            if (unicode_check_encoding_errors(encoding, errors) < 0) {
+                return NULL;
+            }
              _Py_RETURN_UNICODE_EMPTY();
-        v = PyUnicode_Decode(
+        }
+        return PyUnicode_Decode(
                  PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                  encoding, errors);
-        return v;
      }
  
      if (PyUnicode_Check(obj)) {
@@ -3235,6 +3280,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
  
      if (buffer.len == 0) {
          PyBuffer_Release(&buffer);
+        if (unicode_check_encoding_errors(encoding, errors) < 0) {
+            return NULL;
+        }
          _Py_RETURN_UNICODE_EMPTY();
      }
  
@@ -3302,6 +3350,10 @@ PyUnicode_Decode(const char *s,
      Py_buffer info;
      char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
  
+    if (unicode_check_encoding_errors(encoding, errors) < 0) {
+        return NULL;
+    }
+
      if (encoding == NULL) {
          return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
      }
@@ -3562,7 +3614,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
         cannot use it to encode and decode filenames before it is loaded. Load
         the Python codec requires to encode at least its own filename. Use the C
         implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded. See initfsencoding(). */
+       initialized and the Python codec is loaded.
+       See _PyUnicode_InitEncodings(). */
      if (interp->fs_codec.encoding) {
          return PyUnicode_AsEncodedString(unicode,
                                           interp->fs_codec.encoding,
@@ -3591,6 +3644,10 @@ PyUnicode_AsEncodedString(PyObject *unicode,
          return NULL;
      }
  
+    if (unicode_check_encoding_errors(encoding, errors) < 0) {
+        return NULL;
+    }
+
      if (encoding == NULL) {
          return _PyUnicode_AsUTF8String(unicode, errors);
      }
@@ -3800,7 +3857,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
         cannot use it to encode and decode filenames before it is loaded. Load
         the Python codec requires to encode at least its own filename. Use the C
         implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded. See initfsencoding(). */
+       initialized and the Python codec is loaded.
+       See _PyUnicode_InitEncodings(). */
      if (interp->fs_codec.encoding) {
          return PyUnicode_Decode(s, size,
                                  interp->fs_codec.encoding,
author	Victor Stinner <vstinner@redhat.com>
	Tue, 25 Jun 2019 22:51:05 +0000 (00:51 +0200)
committer	GitHub <noreply@github.com>
	Tue, 25 Jun 2019 22:51:05 +0000 (00:51 +0200)
Doc/library/stdtypes.rst		patch \| blob \| history
Doc/using/cmdline.rst		patch \| blob \| history
Doc/whatsnew/3.9.rst		patch \| blob \| history
Lib/_pyio.py		patch \| blob \| history
Lib/test/test_bytes.py		patch \| blob \| history
Lib/test/test_io.py		patch \| blob \| history
Lib/test/test_unicode.py		patch \| blob \| history
Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst	[new file with mode: 0644]	patch \| blob
Modules/_io/textio.c		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history