| ``'backslashreplace'`` | Replace with backslashed escape sequences |
| | (only for encoding). |
+-------------------------+-----------------------------------------------+
-| ``'utf8b'`` | Replace byte with surrogate U+DCxx. |
+| ``'surrogateescape'`` | Replace byte with surrogate U+DCxx. |
+-------------------------+-----------------------------------------------+
In addition, the following error handlers are specific to a single codec:
+-------------------+---------+-------------------------------------------+
.. versionadded:: 3.1
- The ``'utf8b'`` and ``'surrogatepass'`` error handlers.
+ The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
The set of allowed values can be extended via :meth:`register_error`.
.. versionchanged:: 3.1
On some systems, conversion using the file system encoding may
- fail. In this case, Python uses the ``utf8b`` encoding error
- handler, which means that undecodable bytes are replaced by a
+ fail. In this case, Python uses the ``surrogateescape`` encoding
+ error handler, which means that undecodable bytes are replaced by a
Unicode character U+DCxx on decoding, and these are again
translated to the original byte on encoding.
self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
-class Utf8bTest(unittest.TestCase):
+class SurrogateEscapeTest(unittest.TestCase):
def test_utf8(self):
# Bad byte
- self.assertEqual(b"foo\x80bar".decode("utf-8", "utf8b"),
+ self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
"foo\udc80bar")
- self.assertEqual("foo\udc80bar".encode("utf-8", "utf8b"),
+ self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
b"foo\x80bar")
# bad-utf-8 encoded surrogate
- self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "utf8b"),
+ self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
"\udced\udcb0\udc80")
- self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "utf8b"),
+ self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
b"\xed\xb0\x80")
def test_ascii(self):
# bad byte
- self.assertEqual(b"foo\x80bar".decode("ascii", "utf8b"),
+ self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
"foo\udc80bar")
- self.assertEqual("foo\udc80bar".encode("ascii", "utf8b"),
+ self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
b"foo\x80bar")
def test_charmap(self):
# bad byte: \xa5 is unmapped in iso-8859-3
- self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "utf8b"),
+ self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
"foo\udca5bar")
- self.assertEqual("foo\udca5bar".encode("iso-8859-3", "utf8b"),
+ self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
b"foo\xa5bar")
CharmapTest,
WithStmtTest,
TypesTest,
- Utf8bTest,
+ SurrogateEscapeTest,
)
self.fsencoding = sys.getfilesystemencoding()
sys.setfilesystemencoding("utf-8")
self.dir = support.TESTFN
- self.bdir = self.dir.encode("utf-8", "utf8b")
+ self.bdir = self.dir.encode("utf-8", "surrogateescape")
os.mkdir(self.dir)
self.unicodefn = []
for fn in self.filenames:
f = open(os.path.join(self.bdir, fn), "w")
f.close()
- self.unicodefn.append(fn.decode("utf-8", "utf8b"))
+ self.unicodefn.append(fn.decode("utf-8", "surrogateescape"))
def tearDown(self):
shutil.rmtree(self.dir)
return -1;
stringobj = PyUnicode_AsEncodedString(
- u, Py_FileSystemDefaultEncoding, "utf8b");
+ u, Py_FileSystemDefaultEncoding, "surrogateescape");
Py_DECREF(u);
if (stringobj == NULL)
return -1;
if (p == NULL)
continue;
k = PyUnicode_Decode(*e, (int)(p-*e),
- Py_FileSystemDefaultEncoding, "utf8b");
+ Py_FileSystemDefaultEncoding, "surrogateescape");
if (k == NULL) {
PyErr_Clear();
continue;
}
v = PyUnicode_Decode(p+1, strlen(p+1),
- Py_FileSystemDefaultEncoding, "utf8b");
+ Py_FileSystemDefaultEncoding, "surrogateescape");
if (v == NULL) {
PyErr_Clear();
Py_DECREF(k);
return posix_error();
if (use_bytes)
return PyBytes_FromStringAndSize(buf, strlen(buf));
- return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"utf8b");
+ return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"surrogateescape");
}
PyDoc_STRVAR(posix_getcwd__doc__,
w = PyUnicode_FromEncodedObject(v,
Py_FileSystemDefaultEncoding,
- "utf8b");
+ "surrogateescape");
Py_DECREF(v);
if (w != NULL)
v = w;
w = PyUnicode_FromEncodedObject(v,
Py_FileSystemDefaultEncoding,
- "utf8b");
+ "surrogateescape");
if (w != NULL) {
Py_DECREF(v);
v = w;
return res;
PyMem_Free(res);
}
- /* Conversion failed. Fall back to escaping with utf8b. */
+ /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
return 0;
output = PyUnicode_AsEncodedObject(arg,
Py_FileSystemDefaultEncoding,
- "utf8b");
+ "surrogateescape");
Py_DECREF(arg);
if (!output)
return 0;
}
static PyObject *
-PyCodec_UTF8bErrors(PyObject *exc)
+PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
return PyCodec_SurrogatePassErrors(exc);
}
-static PyObject *utf8b_errors(PyObject *self, PyObject *exc)
+static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
{
- return PyCodec_UTF8bErrors(exc);
+ return PyCodec_SurrogateEscapeErrors(exc);
}
static int _PyCodecRegistry_Init(void)
}
},
{
- "utf8b",
+ "surrogateescape",
{
- "utf8b",
- utf8b_errors,
+ "surrogateescape",
+ surrogateescape_errors,
METH_O
}
}