]> granicus.if.org Git - python/commitdiff
Issue #19619: Blacklist non-text codecs in method API
authorSerhiy Storchaka <storchaka@gmail.com>
Mon, 24 Feb 2014 12:43:03 +0000 (14:43 +0200)
committerSerhiy Storchaka <storchaka@gmail.com>
Mon, 24 Feb 2014 12:43:03 +0000 (14:43 +0200)
str.encode, bytes.decode and bytearray.decode now use an
internal API to throw LookupError for known non-text encodings,
rather than attempting the encoding or decoding operation and
then throwing a TypeError for an unexpected output type.

The latter mechanism remains in place for third party non-text
encodings.

Backported changeset d68df99d7a57.

13 files changed:
Include/codecs.h
Lib/codecs.py
Lib/encodings/base64_codec.py
Lib/encodings/bz2_codec.py
Lib/encodings/hex_codec.py
Lib/encodings/quopri_codec.py
Lib/encodings/rot_13.py
Lib/encodings/uu_codec.py
Lib/encodings/zlib_codec.py
Lib/test/test_codecs.py
Misc/NEWS
Objects/unicodeobject.c
Python/codecs.c

index 0d9e9b4585dbad4b22930e0c4fd40b5d8a72f2b4..5ca505fbd5ec042229054add7311887090044094 100644 (file)
@@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
        const char *errors
        );
 
+#ifndef PY_LIMITED_API
+/* Text codec specific encoding and decoding API.
+
+   Checks the encoding against a list of codecs which do not
+   implement a str<->bytes encoding before attempting the
+   operation.
+
+   Please note that these APIs are internal and should not
+   be used in Python C extensions.
+
+ */
+
+PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+#endif
+
+
+
 /* --- Codec Lookup APIs -------------------------------------------------- 
 
    All APIs return a codec object with incremented refcount and are
index 01ae0f3ea63e861dd93293cae444990c951cd04c..c2065dafa27ceb87377717c163fc6331c8b5d110 100644 (file)
@@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE
 ### Codec base classes (defining the API)
 
 class CodecInfo(tuple):
+    """Codec details when looking up the codec registry"""
+
+    # Private API to allow Python 3.4 to blacklist the known non-Unicode
+    # codecs in the standard library. A more general mechanism to
+    # reliably distinguish text encodings from other codecs will hopefully
+    # be defined for Python 3.5
+    #
+    # See http://bugs.python.org/issue19619
+    _is_text_encoding = True # Assume codecs are text encodings by default
 
     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
-        incrementalencoder=None, incrementaldecoder=None, name=None):
+        incrementalencoder=None, incrementaldecoder=None, name=None,
+        *, _is_text_encoding=None):
         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
         self.name = name
         self.encode = encode
@@ -84,6 +94,8 @@ class CodecInfo(tuple):
         self.incrementaldecoder = incrementaldecoder
         self.streamwriter = streamwriter
         self.streamreader = streamreader
+        if _is_text_encoding is not None:
+            self._is_text_encoding = _is_text_encoding
         return self
 
     def __repr__(self):
index 321a961782a596678755ed3bf2607bafa67b597e..881d1ba0beeb5983f64f54d26ac60ab1dc0fc5f8 100644 (file)
@@ -52,4 +52,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
index e65d226bfdff209a4f682480cfa6074e0f268480..fd9495e341baee1437dad343f537938f71066efd 100644 (file)
@@ -74,4 +74,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
index e003fc3ff97ea77de8df3e8a2da09803aaf2d400..f2ed0a7658e23eb68f53bcd4440d785651a05d74 100644 (file)
@@ -52,4 +52,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
index 9243fc443b02180888cc3be1d9e029b6779e0d5d..70f708379381fc65611fe723104f46961c23610f 100644 (file)
@@ -53,4 +53,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
index 3140c1432dcb5531e3f8f399f04b931c21b9fdfa..fff9153b4c6344871cd5747c5475ea95f5b4b80a 100755 (executable)
@@ -43,6 +43,7 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
 
 ### Map
index 69c6f17c7f3178af3f430ae34975d277a53fd76d..e3269e40cd306e1cb42586af87e7cde2d2115a48 100644 (file)
@@ -96,4 +96,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )
index e0b9cdadbcd5d90fe2761b9ec1a8d58c40ff06d9..4c81ca115a7b5a0b3ebfe0a65bf8a2d2f204ab8a 100644 (file)
@@ -74,4 +74,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )
index 1a199f7175d531805af201c827f7865cc21cc0f3..a8b3da0f370b5a70e203b4179c51ed6f8771cdfb 100644 (file)
@@ -4,6 +4,7 @@ import locale
 import sys
 import unittest
 import warnings
+import encodings
 
 from test import support
 
@@ -2408,6 +2409,47 @@ class TransformCodecTest(unittest.TestCase):
             sout = reader.readline()
             self.assertEqual(sout, b"\x80")
 
+    def test_text_to_binary_blacklists_binary_transforms(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.encode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.encode(encoding)
+            self.assertIsNone(failure.exception.__cause__)
+
+    def test_text_to_binary_blacklists_text_transforms(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = (r"^'rot_13' is not a text encoding; "
+               r"use codecs.encode\(\) to handle arbitrary codecs")
+        with self.assertRaisesRegex(LookupError, msg):
+            "just an example message".encode("rot_13")
+
+    def test_binary_to_text_blacklists_binary_transforms(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            encoded_data = codecs.encode(data, encoding)
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                encoded_data.decode(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                bytearray(encoded_data).decode(encoding)
+
+    def test_binary_to_text_blacklists_text_transforms(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            msg = (r"^'rot_13' is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.decode("rot_13")
+            self.assertIsNone(failure.exception.__cause__)
+
 
 @unittest.skipUnless(sys.platform == 'win32',
                      'code pages are specific to Windows')
index 50ac145daadadf991290493f6dd2ff35eb88ff66..4774f3790605aaef317ea2d3709e468affa058cf 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,12 @@ What's New in Python 3.3.5 release candidate 1?
 Core and Builtins
 -----------------
 
+- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an
+  internal API to throw LookupError for known non-text encodings, rather
+  than attempting the encoding or decoding operation and then throwing a
+  TypeError for an unexpected output type. (The latter mechanism remains
+  in place for third party non-text encodings)
+
 - Issue #20588: Make Python-ast.c C89 compliant.
 
 - Issue #20437: Fixed 21 potential bugs when deleting objects references.
index 89094e000f91f15296ad7c4875ce58fadb9b5070..03007531bd8149e66550b5e6111aa80ebd76e1d4 100644 (file)
@@ -3129,7 +3129,7 @@ PyUnicode_Decode(const char *s,
     buffer = PyMemoryView_FromBuffer(&info);
     if (buffer == NULL)
         goto onError;
-    unicode = PyCodec_Decode(buffer, encoding, errors);
+    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
     if (unicode == NULL)
         goto onError;
     if (!PyUnicode_Check(unicode)) {
@@ -3489,7 +3489,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
     }
 
     /* Encode via the codec registry */
-    v = PyCodec_Encode(unicode, encoding, errors);
+    v = _PyCodec_EncodeText(unicode, encoding, errors);
     if (v == NULL)
         return NULL;
 
index fd67d1b9e183bc846422b7182f8bdb5c40d7acfd..5ebc4cb5f6e9e535e4ac41257f15cb279108ce76 100644 (file)
@@ -337,18 +337,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
 
    errors is passed to the encoder factory as argument if non-NULL. */
 
-PyObject *PyCodec_Encode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_EncodeInternal(PyObject *object,
+                        PyObject *encoder,
+                        const char *encoding,
+                        const char *errors)
 {
-    PyObject *encoder = NULL;
     PyObject *args = NULL, *result = NULL;
     PyObject *v = NULL;
 
-    encoder = PyCodec_Encoder(encoding);
-    if (encoder == NULL)
-        goto onError;
-
     args = args_tuple(object, errors);
     if (args == NULL)
         goto onError;
@@ -384,18 +381,15 @@ PyObject *PyCodec_Encode(PyObject *object,
 
    errors is passed to the decoder factory as argument if non-NULL. */
 
-PyObject *PyCodec_Decode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_DecodeInternal(PyObject *object,
+                        PyObject *decoder,
+                        const char *encoding,
+                        const char *errors)
 {
-    PyObject *decoder = NULL;
     PyObject *args = NULL, *result = NULL;
     PyObject *v;
 
-    decoder = PyCodec_Decoder(encoding);
-    if (decoder == NULL)
-        goto onError;
-
     args = args_tuple(object, errors);
     if (args == NULL)
         goto onError;
@@ -425,6 +419,118 @@ PyObject *PyCodec_Decode(PyObject *object,
     return NULL;
 }
 
+/* Generic encoding/decoding API */
+/* Encode object using the codec registered under encoding.
+
+   The encoder is fetched with PyCodec_Encoder(); if that lookup fails,
+   NULL is returned without attempting the encoding.  Otherwise the work
+   is delegated to _PyCodec_EncodeInternal(). */
+PyObject *
+PyCodec_Encode(PyObject *object,
+               const char *encoding,
+               const char *errors)
+{
+    PyObject *encoder = PyCodec_Encoder(encoding);
+
+    if (encoder != NULL)
+        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+    return NULL;
+}
+
+/* Decode object using the codec registered under encoding.
+
+   The decoder is fetched with PyCodec_Decoder(); if that lookup fails,
+   NULL is returned without attempting the decoding.  Otherwise the work
+   is delegated to _PyCodec_DecodeInternal(). */
+PyObject *
+PyCodec_Decode(PyObject *object,
+               const char *encoding,
+               const char *errors)
+{
+    PyObject *decoder = PyCodec_Decoder(encoding);
+
+    if (decoder != NULL)
+        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+    return NULL;
+}
+
+/* Text encoding/decoding API */
+/* Look up encoding in the codec registry and return a new reference to
+   entry `index` of the resulting codec tuple (the callers in this file
+   pass 0 for the encoder and 1 for the decoder).
+
+   Unless the lookup result is an exact tuple, its private
+   `_is_text_encoding` attribute is consulted first:
+     - attribute missing (AttributeError): treated as a text encoding;
+     - attribute false: LookupError is raised, naming codecs.<operation_name>()
+       as the API to use instead;
+     - any other error while reading or truth-testing the attribute is
+       propagated unchanged.
+
+   Returns NULL with an exception set on failure. */
+static PyObject *
+codec_getitem_checked(const char *encoding,
+                      const char *operation_name,
+                      int index)
+{
+    _Py_IDENTIFIER(_is_text_encoding);
+    PyObject *codec;
+    PyObject *attr;
+    PyObject *v;
+    int is_text_codec;
+
+    codec = _PyCodec_Lookup(encoding);
+    if (codec == NULL)
+        return NULL;
+
+    /* Backwards compatibility: assume any raw tuple describes a text
+     * encoding, and the same for anything lacking the private
+     * attribute.
+     */
+    if (!PyTuple_CheckExact(codec)) {
+        attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
+        if (attr == NULL) {
+            if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
+                Py_DECREF(codec);
+                return NULL;
+            }
+            PyErr_Clear();
+        } else {
+            is_text_codec = PyObject_IsTrue(attr);
+            Py_DECREF(attr);
+            if (is_text_codec <= 0) {
+                Py_DECREF(codec);
+                /* A negative result means the truth test itself raised;
+                 * keep that exception instead of masking it or carrying
+                 * on with an error pending.
+                 */
+                if (is_text_codec == 0) {
+                    PyErr_Format(PyExc_LookupError,
+                                 "'%.400s' is not a text encoding; "
+                                 "use codecs.%s() to handle arbitrary codecs",
+                                 encoding, operation_name);
+                }
+                return NULL;
+            }
+        }
+    }
+
+    /* PyTuple_GET_ITEM yields a borrowed reference: take our own before
+     * releasing the tuple that keeps the item alive.
+     */
+    v = PyTuple_GET_ITEM(codec, index);
+    Py_INCREF(v);
+    Py_DECREF(codec);
+    return v;
+}
+
+/* Fetch entry 0 of the codec tuple for encoding through
+   codec_getitem_checked(), reporting the operation as "encode". */
+static PyObject *
+_PyCodec_TextEncoder(const char *encoding)
+{
+    return codec_getitem_checked(encoding, "encode", 0);
+}
+
+/* Fetch entry 1 of the codec tuple for encoding through
+   codec_getitem_checked(), reporting the operation as "decode". */
+static PyObject *
+_PyCodec_TextDecoder(const char *encoding)
+{
+    return codec_getitem_checked(encoding, "decode", 1);
+}
+
+/* Text-only counterpart of PyCodec_Encode().
+
+   The encoder is obtained through _PyCodec_TextEncoder() instead of
+   PyCodec_Encoder(); when that lookup returns NULL the failure is passed
+   straight back to the caller and no encoding is attempted. */
+PyObject *
+_PyCodec_EncodeText(PyObject *object,
+                    const char *encoding,
+                    const char *errors)
+{
+    PyObject *encoder = _PyCodec_TextEncoder(encoding);
+
+    if (encoder != NULL)
+        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+    return NULL;
+}
+
+/* Text-only counterpart of PyCodec_Decode().
+
+   The decoder is obtained through _PyCodec_TextDecoder() instead of
+   PyCodec_Decoder(); when that lookup returns NULL the failure is passed
+   straight back to the caller and no decoding is attempted. */
+PyObject *
+_PyCodec_DecodeText(PyObject *object,
+                    const char *encoding,
+                    const char *errors)
+{
+    PyObject *decoder = _PyCodec_TextDecoder(encoding);
+
+    if (decoder != NULL)
+        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+    return NULL;
+}
+
 /* Register the error handling callback function error under the name
    name. This function will be called by the codec when it encounters
    an unencodable characters/undecodable bytes and doesn't know the