--- /dev/null
+import test.test_support, unittest
+import sys, codecs, htmlentitydefs, unicodedata
+class CodecCallbackTest(unittest.TestCase):
+    def test_xmlcharrefreplace(self):
+        # Replace unencodable characters with numeric character references.
+        # For ascii, latin-1 and charmaps this is completely implemented
+        # in C and should be reasonably fast.
+        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
+        # ascii cannot represent any of the non-ASCII characters, so the
+        # three katakana and U+00E4 all become decimal &#NNN; references.
+        self.assertEqual(
+            s.encode("ascii", "xmlcharrefreplace"),
+            "&#12473;&#12497;&#12514; &#228;nd eggs"
+        )
+        # latin-1 encodes U+00E4 directly as byte 0xe4; only the katakana
+        # characters are replaced.
+        self.assertEqual(
+            s.encode("latin-1", "xmlcharrefreplace"),
+            "&#12473;&#12497;&#12514; \xe4nd eggs"
+        )
+    def test_xmlcharnamereplace(self):
+        # This time use a named character entity for unencodable
+        # characters, if one is available.
+        # Build the reverse mapping "character -> entity name" from
+        # htmlentitydefs.entitydefs: one-character values are decoded as
+        # latin-1, every other value is treated as "&#NNN;" and parsed as a
+        # decimal code point.
+        names = {}
+        for (key, value) in htmlentitydefs.entitydefs.items():
+            if len(value)==1:
+                names[unicode(value, "latin-1")] = unicode(key, "latin-1")
+            else:
+                names[unichr(int(value[2:-1]))] = unicode(key, "latin-1")
+        def xmlcharnamereplace(exc):
+            # Encode-only handler; anything else is rejected.
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = []
+            for c in exc.object[exc.start:exc.end]:
+                try:
+                    l.append(u"&%s;" % names[c])
+                except KeyError:
+                    # No named entity known: fall back to a numeric reference.
+                    l.append(u"&#%d;" % ord(c))
+            # Resume encoding right after the unencodable run.
+            return (u"".join(l), exc.end)
+        codecs.register_error(
+            "test.xmlcharnamereplace", xmlcharnamereplace)
+        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
+        # ascii: every non-ASCII character is replaced; U+1234 has no
+        # named entity and therefore becomes &#4660;.
+        sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
+        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
+        # latin-1: the guillemets are encodable (0xab / 0xbb) and stay as bytes.
+        sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
+        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
+        # iso-8859-15: additionally the euro sign is encodable (0xa4).
+        sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
+        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
+ def test_uninamereplace(self):
+ # We're using the names from the unicode database this time,
+ # and we're doing "syntax highlighting" here, i.e. we include
+ # the replaced text in ANSI escape sequences. For this it is
+ # useful that the error handler is not called for every single
+ # unencodable character, but for a complete sequence of
+ # unencodable characters, otherwise we would output many
+ # unnecessary escape sequences.
+ def uninamereplace(exc):
+ # Encode-only handler; anything else is rejected with TypeError.
+ if not isinstance(exc, UnicodeEncodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ l = []
+ for c in exc.object[exc.start:exc.end]:
+ # Characters without a name fall back to their hex code point.
+ l.append(unicodedata.name(c, u"0x%x" % ord(c)))
+ # One pair of escape sequences around the whole run; resume at exc.end.
+ return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)
+ codecs.register_error(
+ "test.uninamereplace", uninamereplace)
+ sin = u"\xac\u1234\u20ac\u8000"
+ # ascii: all four characters are expected in one highlighted run.
+ sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
+ self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
+ # latin-1: U+00AC is expected as byte 0xac, the rest as one run.
+ sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
+ self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
+ # iso-8859-15: the euro sign is expected as byte 0xa4, which splits the
+ # unencodable characters into two separately highlighted runs.
+ sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1m0x8000\033[0m"
+ self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
+ def test_backslashescape(self):
+ # Does the same as the "unicode-escape" encoding, but with different
+ # base encodings.
+ sin = u"a\xac\u1234\u20ac\u8000"
+ # When sys.maxunicode exceeds 0xffff, also append the highest code
+ # point; every expected string below gets a matching \UXXXXXXXX escape.
+ if sys.maxunicode > 0xffff:
+ sin += unichr(sys.maxunicode)
+ # ascii: every non-ASCII character is expected as an escape.
+ sout = "a\\xac\\u1234\\u20ac\\u8000"
+ if sys.maxunicode > 0xffff:
+ sout += "\\U%08x" % sys.maxunicode
+ self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
+ # latin-1: U+00AC is expected as the raw byte 0xac.
+ sout = "a\xac\\u1234\\u20ac\\u8000"
+ if sys.maxunicode > 0xffff:
+ sout += "\\U%08x" % sys.maxunicode
+ self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
+ # iso-8859-15: additionally the euro sign is expected as byte 0xa4.
+ sout = "a\xac\\u1234\xa4\\u8000"
+ if sys.maxunicode > 0xffff:
+ sout += "\\U%08x" % sys.maxunicode
+ self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
+ def test_relaxedutf8(self):
+ # This is the test for a decoding callback handler,
+ # that relaxes the UTF-8 minimal encoding restriction.
+ # A null byte that is encoded as "\xc0\x80" will be
+ # decoded as a null byte. All other illegal sequences
+ # will be handled strictly.
+ def relaxedutf8(exc):
+ # Decode-only handler; anything else is rejected with TypeError.
+ if not isinstance(exc, UnicodeDecodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
+ return (u"\x00", exc.start+2) # retry after two bytes
+ else:
+ # Every other malformed sequence stays an error.
+ raise exc
+ codecs.register_error(
+ "test.relaxedutf8", relaxedutf8)
+ # Mixes a real NUL byte, overlong NULs and a regular two-byte sequence.
+ sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
+ sout = u"a\x00b\x00c\xfc\x00\x00"
+ self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
+ # "\xc0\x81" is not the accepted overlong NUL, so decoding must fail.
+ sin = "\xc0\x80\xc0\x81"
+ self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8")
+    def test_charmapencode(self):
+        # For charmap encodings the replacement string will be
+        # mapped through the encoding again. This means, that
+        # to be able to use e.g. the "replace" handler, the
+        # charmap has to have a mapping for "?".
+        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
+        sin = u"abc"
+        sout = "AABBCC"
+        self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
+        # "A" has no entry in the charmap, so strict encoding must fail.
+        sin = u"abcA"
+        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
+        # With a str mapping for "?", each of the unmapped characters
+        # D, E and F is replaced by "?" and then mapped to "XYZ".
+        charmap[ord("?")] = "XYZ"
+        sin = u"abcDEF"
+        sout = "AABBCCXYZXYZXYZ"
+        self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
+        # A unicode mapping for "?" is not a valid charmap_encode target
+        # and is expected to be rejected with TypeError.
+        charmap[ord("?")] = u"XYZ"
+        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
+ def test_callbacks(self):
+ # Exercise user-defined error handlers for both decoding and encoding.
+ def handler1(exc):
+ # Accepts encode and decode errors; formats each offending item as
+ # "<ordinal>", wraps the run in brackets and resumes at exc.end.
+ if not isinstance(exc, UnicodeEncodeError) \
+ and not isinstance(exc, UnicodeDecodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
+ return (u"[%s]" % u"".join(l), exc.end)
+ codecs.register_error("test.handler1", handler1)
+ def handler2(exc):
+ # Decode-only variant of handler1 that resumes one position later.
+ if not isinstance(exc, UnicodeDecodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
+ return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
+ codecs.register_error("test.handler2", handler2)
+ s = "\x00\x81\x7f\x80\xff"
+ # handler1: each byte >= 0x80 is replaced on its own.
+ self.assertEqual(
+ s.decode("ascii", "test.handler1"),
+ u"\x00[<129>]\x7f[<128>][<255>]"
+ )
+ # handler2: the byte following each bad byte ("\x7f", "\xff") is skipped.
+ self.assertEqual(
+ s.decode("ascii", "test.handler2"),
+ u"\x00[<129>][<128>]"
+ )
+ # Truncated \uXXXX escapes: the expected replacement covers the backslash,
+ # "u", "3" and the first non-hex character.
+ self.assertEqual(
+ "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
+ u"\u3042[<92><117><51><120>]xx"
+ )
+ # Same, but the input ends inside the escape.
+ self.assertEqual(
+ "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
+ u"\u3042[<92><117><51><120><120>]"
+ )
+ # Charmap decoding with a map that only knows "a".
+ self.assertEqual(
+ codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
+ u"z[<98>][<99>]"
+ )
+ # Encoding: two adjacent unencodable characters are expected in a single
+ # bracketed run.
+ self.assertEqual(
+ u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
+ u"g[<252><223>]rk"
+ )
+ # Same with the unencodable run at the end of the string.
+ self.assertEqual(
+ u"g\xfc\xdf".encode("ascii", "test.handler1"),
+ u"g[<252><223>]"
+ )
+ def test_longstrings(self):
+ # test long strings to check for memory overflow problems
+ errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+ # register the handlers under different names,
+ # to prevent the codec from recognizing the name
+ for err in errors:
+ codecs.register_error("test." + err, codecs.lookup_error(err))
+ l = 1000
+ # Try every handler under its built-in name and under its "test." alias.
+ errors += [ "test." + err for err in errors ]
+ # Inputs: pure ASCII, a character outside latin-1, and a mix of an
+ # ASCII character with a latin-1 one, each repeated l times.
+ for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
+ for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+ for err in errors:
+ # The result is not inspected; a UnicodeError (e.g. from the strict
+ # handler) is an accepted outcome.
+ try:
+ uni.encode(enc, err)
+ except UnicodeError:
+ pass
+    def check_exceptionobjectargs(self, exctype, args, msg):
+        # Test UnicodeError subclasses: construction and __str__ conversion.
+        #   exctype: the exception class under test
+        #   args:    list of valid constructor arguments
+        #   msg:     expected str() of the exception built from args
+        # check with one missing argument
+        self.assertRaises(TypeError, exctype, *args[:-1])
+        # check with one argument too much
+        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
+        # check with one argument of the wrong type
+        wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
+        for i in xrange(len(args)):
+            for wrongarg in wrongargs:
+                # A value of the same type as the valid argument is not "wrong".
+                if type(wrongarg) is type(args[i]):
+                    continue
+                # build argument array: only position i is replaced, every
+                # other position keeps its own valid argument, so the
+                # TypeError can only come from the argument under test
+                callargs = []
+                for j in xrange(len(args)):
+                    if i==j:
+                        callargs.append(wrongarg)
+                    else:
+                        callargs.append(args[j])
+                self.assertRaises(TypeError, exctype, *callargs)
+        # A fully valid argument list must construct and format as expected.
+        exc = exctype(*args)
+        self.assertEquals(str(exc), msg)
+ def test_unicodeencodeerror(self):
+ # Check UnicodeEncodeError construction and message formatting.
+ # NOTE(review): '\ufc' sits in a non-unicode string literal; presumably it
+ # denotes the literal characters backslash, "u", "f", "c" - confirm.
+ # Single character (end == start+1): the message names the character.
+ self.check_exceptionobjectargs(
+ UnicodeEncodeError,
+ ["ascii", u"g\xfcrk", 1, 2, "ouch"],
+ "'ascii' codec can't encode character '\ufc' in position 1: ouch"
+ )
+ # Several characters: the message gives a position range instead.
+ self.check_exceptionobjectargs(
+ UnicodeEncodeError,
+ ["ascii", u"g\xfcrk", 1, 4, "ouch"],
+ "'ascii' codec can't encode characters in position 1-3: ouch"
+ )
+ # Single character at the very start of the string.
+ self.check_exceptionobjectargs(
+ UnicodeEncodeError,
+ ["ascii", u"\xfcx", 0, 1, "ouch"],
+ "'ascii' codec can't encode character '\ufc' in position 0: ouch"
+ )
+ def test_unicodedecodeerror(self):
+ # Check UnicodeDecodeError construction and message formatting.
+ # Single byte: the message names the byte value.
+ self.check_exceptionobjectargs(
+ UnicodeDecodeError,
+ ["ascii", "g\xfcrk", 1, 2, "ouch"],
+ "'ascii' codec can't decode byte 0xfc in position 1: ouch"
+ )
+ # Several bytes: the message gives a position range instead.
+ self.check_exceptionobjectargs(
+ UnicodeDecodeError,
+ ["ascii", "g\xfcrk", 1, 3, "ouch"],
+ "'ascii' codec can't decode bytes in position 1-2: ouch"
+ )
+ def test_unicodetranslateerror(self):
+ # Check UnicodeTranslateError construction and message formatting.
+ # Unlike the encode/decode errors, no encoding name is passed.
+ # Single character: the message names the character.
+ self.check_exceptionobjectargs(
+ UnicodeTranslateError,
+ [u"g\xfcrk", 1, 2, "ouch"],
+ "can't translate character '\\ufc' in position 1: ouch"
+ )
+ # Several characters: the message gives a position range instead.
+ self.check_exceptionobjectargs(
+ UnicodeTranslateError,
+ [u"g\xfcrk", 1, 3, "ouch"],
+ "can't translate characters in position 1-2: ouch"
+ )
+ def test_badandgoodstrictexceptions(self):
+ # Call codecs.strict_errors directly with bad and good arguments.
+ # A non-exception argument is expected to give TypeError.
+ self.assertRaises(
+ TypeError,
+ codecs.strict_errors,
+ 42
+ )
+ # Given a plain Exception instance, an Exception is expected to be raised.
+ self.assertRaises(
+ Exception,
+ codecs.strict_errors,
+ Exception("ouch")
+ )
+ # Given a UnicodeEncodeError, a UnicodeEncodeError is expected to be raised.
+ self.assertRaises(
+ UnicodeEncodeError,
+ codecs.strict_errors,
+ UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
+ )
+ def test_badandgoodignoreexceptions(self):
+ # Call codecs.ignore_errors directly with bad and good arguments.
+ # A non-exception argument is expected to give TypeError.
+ self.assertRaises(
+ TypeError,
+ codecs.ignore_errors,
+ 42
+ )
+ # A plain UnicodeError is expected to be rejected with TypeError as well.
+ self.assertRaises(
+ TypeError,
+ codecs.ignore_errors,
+ UnicodeError("ouch")
+ )
+ # For encode, decode and translate errors the expected result is an empty
+ # replacement and the end position of the error.
+ self.assertEquals(
+ codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"", 1)
+ )
+ self.assertEquals(
+ codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
+ (u"", 1)
+ )
+ self.assertEquals(
+ codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
+ (u"", 1)
+ )
+ def test_badandgoodreplaceexceptions(self):
+ # Call codecs.replace_errors directly with bad and good arguments.
+ # A non-exception argument is expected to give TypeError.
+ self.assertRaises(
+ TypeError,
+ codecs.replace_errors,
+ 42
+ )
+ # A plain UnicodeError is expected to be rejected with TypeError as well.
+ self.assertRaises(
+ TypeError,
+ codecs.replace_errors,
+ UnicodeError("ouch")
+ )
+ # Encoding errors are expected to be replaced by "?".
+ self.assertEquals(
+ codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"?", 1)
+ )
+ # Decoding and translating errors are expected to be replaced by U+FFFD.
+ self.assertEquals(
+ codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
+ (u"\ufffd", 1)
+ )
+ self.assertEquals(
+ codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
+ (u"\ufffd", 1)
+ )
+ def test_badandgoodxmlcharrefreplaceexceptions(self):
+ # Call codecs.xmlcharrefreplace_errors directly with bad and good arguments.
+ # A non-exception argument is expected to give TypeError.
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ 42
+ )
+ # A plain UnicodeError is expected to be rejected with TypeError as well.
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeError("ouch")
+ )
+ # An encoding error is expected to yield a decimal character reference.
+ self.assertEquals(
+ codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"&#%d;" % 0x3042, 1)
+ )
+ # This repeats the plain UnicodeError check from above.
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeError("ouch")
+ )
+ # Decode and translate errors are expected to be rejected with TypeError.
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
+ )
+ def test_badandgoodbackslashreplaceexceptions(self):
+ # Call codecs.backslashreplace_errors directly with bad and good arguments.
+ # A non-exception argument is expected to give TypeError.
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ 42
+ )
+ # A plain UnicodeError is expected to be rejected with TypeError as well.
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeError("ouch")
+ )
+ # Encoding errors: the expected escape form depends on the code point,
+ # \xXX up to 0xff and \uXXXX up to 0xffff (boundaries checked below).
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"\\u3042", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
+ (u"\\x00", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
+ (u"\\xff", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
+ (u"\\u0100", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
+ (u"\\uffff", 1)
+ )
+ # Code points above 0xffff are only checked when sys.maxunicode allows
+ # them; they are expected as \UXXXXXXXX.
+ if sys.maxunicode>0xffff:
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
+ (u"\\U00010000", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
+ (u"\\U0010ffff", 1)
+ )
+ # This repeats the plain UnicodeError check from above.
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeError("ouch")
+ )
+ # Decode and translate errors are expected to be rejected with TypeError.
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
+ )
+    def test_badhandlerresults(self):
+        # An error handler must return a (unicode, int) tuple. Register
+        # handlers that return something else and check that both encoding
+        # and decoding reject the result with TypeError.
+        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
+        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
+        for res in results:
+            # The handler has to accept the exception argument; otherwise the
+            # call itself raises TypeError and the bad result is never seen.
+            # It is only called within this iteration, so the closure over
+            # res always sees the current value.
+            codecs.register_error("test.badhandler", lambda exc: res)
+            for enc in encs:
+                self.assertRaises(
+                    TypeError,
+                    u"\u3042".encode,
+                    enc,
+                    "test.badhandler"
+                )
+            # Undecodable input for each codec.
+            for (enc, data) in (
+                ("ascii", "\xff"),
+                ("utf-8", "\xff"),
+                ("utf-7", "+x-")
+            ):
+                self.assertRaises(
+                    TypeError,
+                    data.decode,
+                    enc,
+                    "test.badhandler"
+                )
+    def test_lookup(self):
+        # Every predefined handler name must resolve, via lookup_error, to
+        # the corresponding function exported by the codecs module.
+        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
+        self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore"))
+        self.assertEquals(codecs.replace_errors, codecs.lookup_error("replace"))
+        self.assertEquals(
+            codecs.xmlcharrefreplace_errors,
+            codecs.lookup_error("xmlcharrefreplace")
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors,
+            codecs.lookup_error("backslashreplace")
+        )
+def test_main():
+ # Collect all CodecCallbackTest test methods into one suite and run it
+ # through test.test_support.run_suite.
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(CodecCallbackTest))
+ test.test_support.run_suite(suite)
+# Run the tests when this file is executed as a script.
+if __name__ == "__main__":
+ test_main()
const char *errors)
PyObject *buffer = NULL, *unicode;
- if (encoding == NULL)
+ if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
/* Shortcuts for common default encodings */
return -1;
+/* error handling callback helper:
+ build arguments, call the callback and check the arguments,
+ if no exception occurred, copy the replacement to the output
+ and adjust various state variables.
+ return 0 on success, -1 on error
+int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
+ const char *encoding, const char *reason,
+ const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
+ PyObject **output, int *outpos, Py_UNICODE **outptr)
+ static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
+ PyObject *restuple = NULL;
+ PyObject *repunicode = NULL;
+ int outsize = PyUnicode_GET_SIZE(*output);
+ int requiredsize;
+ int newpos;
+ Py_UNICODE *repptr;
+ int repsize;
+ int res = -1;
+ if (*errorHandler == NULL) {
+ *errorHandler = PyCodec_LookupError(errors);
+ if (*errorHandler == NULL)
+ goto onError;
+ }
+ if (*exceptionObject == NULL) {
+ *exceptionObject = PyUnicodeDecodeError_Create(
+ encoding, input, insize, *startinpos, *endinpos, reason);
+ if (*exceptionObject == NULL)
+ goto onError;
+ }
+ else {
+ if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
+ goto onError;
+ }
+ restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
+ if (restuple == NULL)
+ goto onError;
+ if (!PyTuple_Check(restuple)) {
+ PyErr_Format(PyExc_TypeError, &argparse[4]);
+ goto onError;
+ }
+ if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
+ goto onError;
+ if (newpos<0)
+ newpos = 0;
+ else if (newpos>insize)
+ newpos = insize;
+ /* need more space? (at least enough for what we
+ have+the replacement+the rest of the string (starting
+ at the new input position), so we won't have to check space
+ when there are no errors in the rest of the string) */
+ repptr = PyUnicode_AS_UNICODE(repunicode);
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ requiredsize = *outpos + repsize + insize-newpos;
+ if (requiredsize > outsize) {
+ if (requiredsize<2*outsize)
+ requiredsize = 2*outsize;
+ if (PyUnicode_Resize(output, requiredsize))
+ goto onError;
+ *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
+ }
+ *endinpos = newpos;
+ *inptr = input + newpos;
+ Py_UNICODE_COPY(*outptr, repptr, repsize);
+ *outptr += repsize;
+ *outpos += repsize;
+ /* we made it! */
+ res = 0;
+ onError:
+ Py_XDECREF(restuple);
+ return res;
/* --- UTF-7 Codec -------------------------------------------------------- */
/* see RFC2152 for details */
} \
} \
-int utf7_decoding_error(Py_UNICODE **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "UTF-7 decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- if (dest != NULL) {
- (*dest)++;
- }
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "UTF-7 decoding error; unknown error handling code: %.400s",
- errors);
- return -1;
- }
PyObject *PyUnicode_DecodeUTF7(const char *s,
int size,
const char *errors)
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
const char *e;
PyUnicodeObject *unicode;
int inShift = 0;
unsigned int bitsleft = 0;
unsigned long charsleft = 0;
- int surrogate = 0;
+ int surrogate = 0;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
unicode = _PyUnicode_New(size);
if (!unicode)
e = s + size;
while (s < e) {
- Py_UNICODE ch = *s;
+ Py_UNICODE ch;
+ restart:
+ ch = *s;
if (inShift) {
if ((ch == '-') || !B64CHAR(ch)) {
else if ( ch == '+' ) {
+ startinpos = s-starts;
if (s < e && *s == '-') {
- if (utf7_decoding_error(&p, errors, errmsg))
- goto onError;
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ endinpos = s-starts;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf7", errmsg,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&unicode, &outpos, &p))
+ goto onError;
if (inShift) {
- if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ endinpos = size;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf7", "unterminated shift sequence",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&unicode, &outpos, &p))
goto onError;
+ if (s < e)
+ goto restart;
- if (_PyUnicode_Resize(&unicode, p - unicode->str))
+ if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)unicode;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
-int utf8_decoding_error(const char **source,
- Py_UNICODE **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "UTF-8 decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- (*source)++;
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- (*source)++;
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "UTF-8 decoding error; unknown error handling code: %.400s",
- errors);
- return -1;
- }
PyObject *PyUnicode_DecodeUTF8(const char *s,
int size,
const char *errors)
+ const char *starts = s;
int n;
+ int startinpos;
+ int endinpos;
+ int outpos;
const char *e;
PyUnicodeObject *unicode;
const char *errmsg = "";
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
if (s + n > e) {
errmsg = "unexpected end of data";
+ startinpos = s-starts;
+ endinpos = size;
goto utf8Error;
case 0:
errmsg = "unexpected code byte";
+ startinpos = s-starts;
+ endinpos = startinpos+1;
goto utf8Error;
case 1:
errmsg = "internal error";
+ startinpos = s-starts;
+ endinpos = startinpos+1;
goto utf8Error;
case 2:
if ((s[1] & 0xc0) != 0x80) {
errmsg = "invalid data";
+ startinpos = s-starts;
+ endinpos = startinpos+2;
goto utf8Error;
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
if (ch < 0x80) {
+ startinpos = s-starts;
+ endinpos = startinpos+2;
errmsg = "illegal encoding";
goto utf8Error;
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80) {
errmsg = "invalid data";
+ startinpos = s-starts;
+ endinpos = startinpos+3;
goto utf8Error;
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
errmsg = "illegal encoding";
+ startinpos = s-starts;
+ endinpos = startinpos+3;
goto utf8Error;
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80) {
errmsg = "invalid data";
+ startinpos = s-starts;
+ endinpos = startinpos+4;
goto utf8Error;
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
UTF-16 */
errmsg = "illegal encoding";
+ startinpos = s-starts;
+ endinpos = startinpos+4;
goto utf8Error;
/* Other sizes are only needed for UCS-4 */
errmsg = "unsupported Unicode code range";
+ startinpos = s-starts;
+ endinpos = startinpos+n;
goto utf8Error;
s += n;
- if (utf8_decoding_error(&s, &p, errors, errmsg))
- goto onError;
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf8", errmsg,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&unicode, &outpos, &p))
+ goto onError;
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)unicode;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
/* --- UTF-16 Codec ------------------------------------------------------- */
-int utf16_decoding_error(Py_UNICODE **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "UTF-16 decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- if (dest) {
- (*dest)++;
- }
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "UTF-16 decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
PyObject *
PyUnicode_DecodeUTF16(const char *s,
int size,
const char *errors,
int *byteorder)
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
PyUnicodeObject *unicode;
const unsigned char *q, *e;
int ihi = 0, ilo = 1;
- /* size should be an even number */
- if (size & 1) {
- if (utf16_decoding_error(NULL, errors, "truncated data"))
- return NULL;
- --size; /* else ignore the oddball byte */
- }
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
while (q < e) {
- Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
+ Py_UNICODE ch;
+ /* remaining bytes at the end? (size should be even) */
+ if (e-q<2) {
+ errmsg = "truncated data";
+ startinpos = ((const char *)q)-starts;
+ endinpos = ((const char *)e)-starts;
+ goto utf16Error;
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
+ }
+ ch = (q[ihi] << 8) | q[ilo];
q += 2;
if (ch < 0xD800 || ch > 0xDFFF) {
/* UTF-16 code pair: */
if (q >= e) {
errmsg = "unexpected end of data";
+ startinpos = (((const char *)q)-2)-starts;
+ endinpos = ((const char *)e)-starts;
goto utf16Error;
if (0xD800 <= ch && ch <= 0xDBFF) {
else {
errmsg = "illegal UTF-16 surrogate";
+ startinpos = (((const char *)q)-4)-starts;
+ endinpos = startinpos+2;
goto utf16Error;
errmsg = "illegal encoding";
+ startinpos = (((const char *)q)-2)-starts;
+ endinpos = startinpos+2;
/* Fall through to report the error */
- if (utf16_decoding_error(&p, errors, errmsg))
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf16", errmsg,
+ starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
+ (PyObject **)&unicode, &outpos, &p))
goto onError;
if (_PyUnicode_Resize(&unicode, p - unicode->str))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)unicode;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
/* --- Unicode Escape Codec ----------------------------------------------- */
-int unicodeescape_decoding_error(Py_UNICODE **x,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "Unicode-Escape decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- (*x)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "Unicode-Escape decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
int size,
const char *errors)
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
+ int i;
PyUnicodeObject *v;
- Py_UNICODE *p, *buf;
+ Py_UNICODE *p;
const char *end;
char* message;
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the
- length after conversion to the true value. */
+ length after conversion to the true value.
+ (but if the error callback returns a long replacement string
+ we'll have to allocate more space) */
v = _PyUnicode_New(size);
if (v == NULL)
goto onError;
if (size == 0)
return (PyObject *)v;
- p = buf = PyUnicode_AS_UNICODE(v);
+ p = PyUnicode_AS_UNICODE(v);
end = s + size;
while (s < end) {
unsigned char c;
- int i, digits;
+ int digits;
/* Non-escape characters are interpreted as Unicode ordinals */
if (*s != '\\') {
+ startinpos = s-starts;
/* \ - Escapes */
switch (*s++) {
message = "truncated \\UXXXXXXXX escape";
chr = 0;
- for (i = 0; i < digits; i++) {
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (s+digits>end) {
+ endinpos = size;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", "end of string in escape sequence",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
+ goto onError;
+ goto nextByte;
+ }
+ for (i = 0; i < digits; ++i) {
c = (unsigned char) s[i];
if (!isxdigit(c)) {
- if (unicodeescape_decoding_error(&p, errors, message))
+ endinpos = (s+i+1)-starts;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", message,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
- chr = 0xffffffff;
- i++;
- break;
+ goto nextByte;
chr = (chr<<4) & ~0xF;
if (c >= '0' && c <= '9')
s += i;
if (chr == 0xffffffff)
- /* _decoding_error will have already written into the
- target buffer. */
- break;
+ /* _decoding_error will have already written into the
+ target buffer. */
+ break;
/* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff)
*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
} else {
- if (unicodeescape_decoding_error(
- &p, errors,
- "illegal Unicode character")
- )
+ endinpos = s-starts;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", "illegal Unicode character",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
goto store;
- if (unicodeescape_decoding_error(&p, errors, message))
+ endinpos = s-starts;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", message,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
if (s > end) {
- if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
+ message = "\\ at end of string";
+ s--;
+ endinpos = s-starts;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", message,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
else {
+ nextByte:
+ ;
- if (_PyUnicode_Resize(&v, (int)(p - buf)))
- goto onError;
+ if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
+ goto onError;
return (PyObject *)v;
"\\N escapes not supported (can't load unicodedata module)"
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
int size,
const char *errors)
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
PyUnicodeObject *v;
- Py_UNICODE *p, *buf;
+ Py_UNICODE *p;
const char *end;
const char *bs;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the
- length after conversion to the true value. */
+ length after conversion to the true value. (But decoding error
+ handler might have to resize the string) */
v = _PyUnicode_New(size);
if (v == NULL)
goto onError;
if (size == 0)
return (PyObject *)v;
- p = buf = PyUnicode_AS_UNICODE(v);
+ p = PyUnicode_AS_UNICODE(v);
end = s + size;
while (s < end) {
unsigned char c;
*p++ = (unsigned char)*s++;
+ startinpos = s-starts;
/* \u-escapes are only interpreted iff the number of leading
backslashes if odd */
/* \uXXXX with 4 hex digits */
- for (x = 0, i = 0; i < 4; i++) {
- c = (unsigned char)s[i];
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ for (x = 0, i = 0; i < 4; ++i, ++s) {
+ c = (unsigned char)*s;
if (!isxdigit(c)) {
- if (unicodeescape_decoding_error(&p, errors,
- "truncated \\uXXXX"))
+ endinpos = s-starts;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "rawunicodeescape", "truncated \\uXXXX",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
- x = 0xffffffff;
- i++;
- break;
+ goto nextByte;
x = (x<<4) & ~0xF;
if (c >= '0' && c <= '9')
x += 10 + c - 'A';
- s += i;
- if (x != 0xffffffff)
- *p++ = x;
+ *p++ = x;
+ nextByte:
+ ;
- if (_PyUnicode_Resize(&v, (int)(p - buf)))
+ if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)v;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
return NULL;
-int latin1_encoding_error(const Py_UNICODE **source,
- char **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "Latin-1 encoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **dest = '?';
- (*dest)++;
- return 0;
+/* create or adjust a UnicodeEncodeError */
+static void make_encode_exception(PyObject **exceptionObject,
+ const char *encoding,
+ const Py_UNICODE *unicode, int size,
+ int startpos, int endpos,
+ const char *reason)
+ if (*exceptionObject == NULL) {
+ *exceptionObject = PyUnicodeEncodeError_Create(
+ encoding, unicode, size, startpos, endpos, reason);
else {
- PyErr_Format(PyExc_ValueError,
- "Latin-1 encoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
+ if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
+ goto onError;
+ if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
+ goto onError;
+ if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
+ goto onError;
+ return;
+ onError:
+ Py_DECREF(*exceptionObject);
+ *exceptionObject = NULL;
-PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
- int size,
- const char *errors)
+/* raises a UnicodeEncodeError */
+static void raise_encode_exception(PyObject **exceptionObject,
+ const char *encoding,
+ const Py_UNICODE *unicode, int size,
+ int startpos, int endpos,
+ const char *reason)
- PyObject *repr;
- char *s, *start;
+ make_encode_exception(exceptionObject,
+ encoding, unicode, size, startpos, endpos, reason);
+ if (*exceptionObject != NULL)
+ PyCodec_StrictErrors(*exceptionObject);
- repr = PyString_FromStringAndSize(NULL, size);
- if (repr == NULL)
- return NULL;
- if (size == 0)
- return repr;
+/* error handling callback helper:
+ build arguments, call the callback and check the arguments,
+ put the result into newpos and return the replacement string, which
+ has to be freed by the caller */
+static PyObject *unicode_encode_call_errorhandler(const char *errors,
+ PyObject **errorHandler,
+ const char *encoding, const char *reason,
+ const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
+ int startpos, int endpos,
+ int *newpos)
+ static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
- s = PyString_AS_STRING(repr);
- start = s;
- while (size-- > 0) {
- Py_UNICODE ch = *p++;
- if (ch >= 256) {
- if (latin1_encoding_error(&p, &s, errors,
- "ordinal not in range(256)"))
- goto onError;
+ PyObject *restuple;
+ PyObject *resunicode;
+ if (*errorHandler == NULL) {
+ *errorHandler = PyCodec_LookupError(errors);
+ if (*errorHandler == NULL)
+ return NULL;
+ }
+ make_encode_exception(exceptionObject,
+ encoding, unicode, size, startpos, endpos, reason);
+ if (*exceptionObject == NULL)
+ return NULL;
+ restuple = PyObject_CallFunctionObjArgs(
+ *errorHandler, *exceptionObject, NULL);
+ if (restuple == NULL)
+ return NULL;
+ if (!PyTuple_Check(restuple)) {
+ PyErr_Format(PyExc_TypeError, &argparse[4]);
+ Py_DECREF(restuple);
+ return NULL;
+ }
+ if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+ &resunicode, newpos)) {
+ Py_DECREF(restuple);
+ return NULL;
+ }
+ if (*newpos<0)
+ *newpos = 0;
+ else if (*newpos>size)
+ *newpos = size;
+ Py_INCREF(resunicode);
+ Py_DECREF(restuple);
+ return resunicode;
+static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
+ int size,
+ const char *errors,
+ int limit)
+ /* output object */
+ PyObject *res;
+ /* pointers to the beginning and end+1 of input */
+ const Py_UNICODE *startp = p;
+ const Py_UNICODE *endp = p + size;
+ /* pointer to the beginning of the unencodable characters */
+ /* const Py_UNICODE *badp = NULL; */
+ /* pointer into the output */
+ char *str;
+ /* current output position */
+ int respos = 0;
+ int ressize;
+ char *encoding = (limit == 256) ? "latin-1" : "ascii";
+ char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ /* the following variable is used for caching string comparisons
+ * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
+ int known_errorHandler = -1;
+ /* allocate enough for a simple encoding without
+ replacements, if we need more, we'll resize */
+ res = PyString_FromStringAndSize(NULL, size);
+ if (res == NULL)
+ goto onError;
+ if (size == 0)
+ return res;
+ str = PyString_AS_STRING(res);
+ ressize = size;
+ while (p<endp) {
+ Py_UNICODE c = *p;
+ /* can we encode this? */
+ if (c<limit) {
+ /* no overflow check, because we know that the space is enough */
+ *str++ = (char)c;
+ ++p;
+ }
+ else {
+ int unicodepos = p-startp;
+ int requiredsize;
+ PyObject *repunicode;
+ int repsize;
+ int newpos;
+ int respos;
+ Py_UNICODE *uni2;
+ /* startpos for collecting unencodable chars */
+ const Py_UNICODE *collstart = p;
+ const Py_UNICODE *collend = p;
+ /* find all unencodable characters */
+ while ((collend < endp) && ((*collend)>=limit))
+ ++collend;
+ /* cache callback name lookup (if not done yet, i.e. it's the first error) */
+ if (known_errorHandler==-1) {
+ if ((errors==NULL) || (!strcmp(errors, "strict")))
+ known_errorHandler = 1;
+ else if (!strcmp(errors, "replace"))
+ known_errorHandler = 2;
+ else if (!strcmp(errors, "ignore"))
+ known_errorHandler = 3;
+ else if (!strcmp(errors, "xmlcharrefreplace"))
+ known_errorHandler = 4;
+ else
+ known_errorHandler = 0;
+ }
+ switch (known_errorHandler) {
+ case 1: /* strict */
+ raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
+ goto onError;
+ case 2: /* replace */
+ while (collstart++<collend)
+ *str++ = '?'; /* fall through */
+ case 3: /* ignore */
+ p = collend;
+ break;
+ case 4: /* xmlcharrefreplace */
+ respos = str-PyString_AS_STRING(res);
+ /* determine replacement size (temporarily (mis)uses p) */
+ for (p = collstart, repsize = 0; p < collend; ++p) {
+ if (*p<10)
+ repsize += 2+1+1;
+ else if (*p<100)
+ repsize += 2+2+1;
+ else if (*p<1000)
+ repsize += 2+3+1;
+ else if (*p<10000)
+ repsize += 2+4+1;
+ else if (*p<100000)
+ repsize += 2+5+1;
+ else if (*p<1000000)
+ repsize += 2+6+1;
+ else
+ repsize += 2+7+1;
+ }
+ requiredsize = respos+repsize+(endp-collend);
+ if (requiredsize > ressize) {
+ if (requiredsize<2*ressize)
+ requiredsize = 2*ressize;
+ if (_PyString_Resize(&res, requiredsize))
+ goto onError;
+ str = PyString_AS_STRING(res) + respos;
+ ressize = requiredsize;
+ }
+ /* generate replacement (temporarily (mis)uses p) */
+ for (p = collstart; p < collend; ++p) {
+ str += sprintf(str, "&#%d;", (int)*p);
+ }
+ p = collend;
+ break;
+ default:
+ repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+ encoding, reason, startp, size, &exc,
+ collstart-startp, collend-startp, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ /* need more space? (at least enough for what we
+ have+the replacement+the rest of the string, so
+ we won't have to check space for encodable characters) */
+ respos = str-PyString_AS_STRING(res);
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ requiredsize = respos+repsize+(endp-collend);
+ if (requiredsize > ressize) {
+ if (requiredsize<2*ressize)
+ requiredsize = 2*ressize;
+ if (_PyString_Resize(&res, requiredsize)) {
+ Py_DECREF(repunicode);
+ goto onError;
+ }
+ str = PyString_AS_STRING(res) + respos;
+ ressize = requiredsize;
+ }
+ /* check if there is anything unencodable in the replacement
+ and copy it to the output */
+ for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
+ c = *uni2;
+ if (c >= limit) {
+ raise_encode_exception(&exc, encoding, startp, size,
+ unicodepos, unicodepos+1, reason);
+ Py_DECREF(repunicode);
+ goto onError;
+ }
+ *str = (char)c;
+ }
+ p = startp + newpos;
+ Py_DECREF(repunicode);
+ }
- else
- *s++ = (char)ch;
- /* Resize if error handling skipped some characters */
- if (s - start < PyString_GET_SIZE(repr))
- _PyString_Resize(&repr, s - start);
- return repr;
+ /* Resize if we allocated too much */
+ respos = str-PyString_AS_STRING(res);
+ if (respos<ressize)
+ /* If this fails, res will be NULL */
+ _PyString_Resize(&res, respos);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ return res;
- onError:
- Py_DECREF(repr);
+ onError:
+ Py_XDECREF(res);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
+PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
+ int size,
+ const char *errors)
+ return unicode_encode_ucs1(p, size, errors, 256);
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
if (!PyUnicode_Check(unicode)) {
/* --- 7-bit ASCII Codec -------------------------------------------------- */
-int ascii_decoding_error(const char **source,
- Py_UNICODE **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "ASCII decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "ASCII decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
PyObject *PyUnicode_DecodeASCII(const char *s,
int size,
const char *errors)
+ const char *starts = s;
PyUnicodeObject *v;
+ int startinpos;
+ int endinpos;
+ int outpos;
+ const char *e;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && *(unsigned char*)s < 128) {
if (size == 0)
return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v);
- while (size-- > 0) {
- register unsigned char c;
- c = (unsigned char)*s++;
- if (c < 128)
+ /* iterate by pointer rather than by count: s is handed to the error
+ handler helper by address below (presumably so it can reposition
+ the input -- confirm against unicode_decode_call_errorhandler) */
+ e = s + size;
+ while (s < e) {
+ register unsigned char c = (unsigned char)*s;
+ if (c < 128) {
*p++ = c;
- else if (ascii_decoding_error(&s, &p, errors,
- "ordinal not in range(128)"))
+ ++s;
+ }
+ else {
+ /* report exactly one offending byte: [startinpos, startinpos+1) */
+ startinpos = s-starts;
+ endinpos = startinpos + 1;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "ascii", "ordinal not in range(128)",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
+ }
-if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
+ /* v is a unicode object, so its length has to be read with
+ PyUnicode_GET_SIZE; PyString_GET_SIZE reinterprets it as a string
+ object (same check as in PyUnicode_DecodeCharmap) */
+ if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)v;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
-int ascii_encoding_error(const Py_UNICODE **source,
- char **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "ASCII encoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **dest = '?';
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "ASCII encoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
int size,
const char *errors)
- PyObject *repr;
- char *s, *start;
- repr = PyString_FromStringAndSize(NULL, size);
- if (repr == NULL)
- return NULL;
- if (size == 0)
- return repr;
- s = PyString_AS_STRING(repr);
- start = s;
- while (size-- > 0) {
- Py_UNICODE ch = *p++;
- if (ch >= 128) {
- if (ascii_encoding_error(&p, &s, errors,
- "ordinal not in range(128)"))
- goto onError;
- }
- else
- *s++ = (char)ch;
- }
- /* Resize if error handling skipped some characters */
- if (s - start < PyString_GET_SIZE(repr))
- _PyString_Resize(&repr, s - start);
- return repr;
- onError:
- Py_DECREF(repr);
- return NULL;
+ return unicode_encode_ucs1(p, size, errors, 128);
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
/* --- Character Mapping Codec -------------------------------------------- */
-int charmap_decoding_error(const char **source,
- Py_UNICODE **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "charmap decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "charmap decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
PyObject *PyUnicode_DecodeCharmap(const char *s,
int size,
PyObject *mapping,
const char *errors)
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
+ const char *e;
PyUnicodeObject *v;
int extrachars = 0;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Default to Latin-1 */
if (mapping == NULL)
if (size == 0)
return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v);
- while (size-- > 0) {
- unsigned char ch = *s++;
+ e = s + size;
+ while (s < e) {
+ unsigned char ch = *s;
PyObject *w, *x;
/* Get mapping (char ordinal -> integer, Unicode char or None) */
else if (x == Py_None) {
/* undefined mapping */
- if (charmap_decoding_error(&s, &p, errors,
- "character maps to <undefined>")) {
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ startinpos = s-starts;
+ endinpos = startinpos+1;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "charmap", "character maps to <undefined>",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p)) {
goto onError;
+ continue;
else if (PyUnicode_Check(x)) {
int targetsize = PyUnicode_GET_SIZE(x);
goto onError;
+ ++s;
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)v;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
-int charmap_encoding_error(const Py_UNICODE **source,
- char **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "charmap encoding error: %.400s",
- details);
- return -1;
+/* Lookup the character c in the mapping. If the character
+ can't be found, Py_None is returned (or NULL, if another
+ error occurred). */
+static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
+ PyObject *w = PyInt_FromLong((long)c);
+ PyObject *x;
+ if (w == NULL)
+ return NULL;
+ x = PyObject_GetItem(mapping, w);
+ Py_DECREF(w);
+ if (x == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_LookupError)) {
+ /* No mapping found means: mapping is undefined. */
+ PyErr_Clear();
+ x = Py_None;
+ Py_INCREF(x);
+ return x;
+ } else
+ return NULL;
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
+ else if (PyInt_Check(x)) {
+ long value = PyInt_AS_LONG(x);
+ if (value < 0 || value > 255) {
+ PyErr_SetString(PyExc_TypeError,
+ "character mapping must be in range(256)");
+ Py_DECREF(x);
+ return NULL;
+ }
+ return x;
- else if (strcmp(errors,"replace") == 0) {
- **dest = '?';
- (*dest)++;
- return 0;
+ else if (PyString_Check(x))
+ return x;
+ else {
+ /* wrong return value */
+ PyErr_SetString(PyExc_TypeError,
+ "character mapping must return integer, None or str");
+ Py_DECREF(x);
+ return NULL;
+/* lookup the character, put the result in the output string and adjust
+ various state variables. Reallocate the output string if not enough
+ space is available. Return a new reference to the object that
+ was put in the output buffer, or Py_None, if the mapping was undefined
+ (in which case no character was written) or NULL, if a
+ reallocation error occurred. The caller must decref the result */
+PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
+ PyObject **outobj, int *outpos)
+ PyObject *rep = charmapencode_lookup(c, mapping);
+ if (rep==NULL)
+ return NULL;
+ else if (rep==Py_None)
+ return rep;
else {
- PyErr_Format(PyExc_ValueError,
- "charmap encoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
+ char *outstart = PyString_AS_STRING(*outobj);
+ int outsize = PyString_GET_SIZE(*outobj);
+ if (PyInt_Check(rep)) {
+ int requiredsize = *outpos+1;
+ if (outsize<requiredsize) {
+ /* exponentially overallocate to minimize reallocations */
+ if (requiredsize < 2*outsize)
+ requiredsize = 2*outsize;
+ if (_PyString_Resize(outobj, requiredsize)) {
+ Py_DECREF(rep);
+ return NULL;
+ }
+ outstart = PyString_AS_STRING(*outobj);
+ }
+ outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
+ }
+ else {
+ const char *repchars = PyString_AS_STRING(rep);
+ int repsize = PyString_GET_SIZE(rep);
+ int requiredsize = *outpos+repsize;
+ if (outsize<requiredsize) {
+ /* exponentially overallocate to minimize reallocations */
+ if (requiredsize < 2*outsize)
+ requiredsize = 2*outsize;
+ if (_PyString_Resize(outobj, requiredsize)) {
+ Py_DECREF(rep);
+ return NULL;
+ }
+ outstart = PyString_AS_STRING(*outobj);
+ }
+ memcpy(outstart + *outpos, repchars, repsize);
+ *outpos += repsize;
+ }
+ }
+ return rep;
+/* handle an error in PyUnicode_EncodeCharmap:
+ collect the run of unencodable characters that starts at p[*inpos],
+ apply the error handling named by errors (strict, replace, ignore,
+ xmlcharrefreplace or a registered callback), append any replacement
+ to *res at *respos and store the input position to resume at in *inpos.
+ *known_errorHandler caches the classification of errors
+ (-1 = not determined yet).
+ Return 0 on success, -1 on error */
+int charmap_encoding_error(
+ const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
+ PyObject **exceptionObject,
+ int *known_errorHandler, PyObject *errorHandler, const char *errors,
+ PyObject **res, int *respos)
+ PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+ int repsize;
+ int newpos;
+ Py_UNICODE *uni2;
+ /* startpos for collecting unencodable chars */
+ int collstartpos = *inpos;
+ int collendpos = *inpos+1;
+ int collpos;
+ char *encoding = "charmap";
+ char *reason = "character maps to <undefined>";
+ PyObject *x;
+ /* errorHandler arrives by value: remember what the caller passed so
+ that a handler looked up further down can be recognised as a
+ reference owned by this function */
+ PyObject *callerHandler = errorHandler;
+ /* find all unencodable characters */
+ while (collendpos < size) {
+ x = charmapencode_lookup(p[collendpos], mapping);
+ if (x==NULL)
+ return -1;
+ else if (x!=Py_None) {
+ Py_DECREF(x);
+ break;
+ }
+ Py_DECREF(x);
+ ++collendpos;
+ }
+ /* cache callback name lookup
+ * (if not done yet, i.e. it's the first error) */
+ if (*known_errorHandler==-1) {
+ if ((errors==NULL) || (!strcmp(errors, "strict")))
+ *known_errorHandler = 1;
+ else if (!strcmp(errors, "replace"))
+ *known_errorHandler = 2;
+ else if (!strcmp(errors, "ignore"))
+ *known_errorHandler = 3;
+ else if (!strcmp(errors, "xmlcharrefreplace"))
+ *known_errorHandler = 4;
+ else
+ *known_errorHandler = 0;
+ }
+ switch (*known_errorHandler) {
+ case 1: /* strict */
+ raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ return -1;
+ case 2: /* replace */
+ /* one '?' per unencodable character; the '?' itself goes through
+ the mapping and it is an error if the mapping cannot encode it */
+ for (collpos = collstartpos; collpos<collendpos; ++collpos) {
+ x = charmapencode_output('?', mapping, res, respos);
+ if (x==NULL) {
+ return -1;
+ }
+ else if (x==Py_None) {
+ Py_DECREF(x);
+ raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ return -1;
+ }
+ Py_DECREF(x);
+ }
+ /* fall through */
+ case 3: /* ignore */
+ *inpos = collendpos;
+ break;
+ case 4: /* xmlcharrefreplace */
+ /* generate replacement */
+ for (collpos = collstartpos; collpos < collendpos; ++collpos) {
+ char buffer[2+29+1+1];
+ char *cp;
+ sprintf(buffer, "&#%d;", (int)p[collpos]);
+ /* every character of the reference is itself mapped */
+ for (cp = buffer; *cp; ++cp) {
+ x = charmapencode_output(*cp, mapping, res, respos);
+ if (x==NULL)
+ return -1;
+ else if (x==Py_None) {
+ Py_DECREF(x);
+ raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ return -1;
+ }
+ Py_DECREF(x);
+ }
+ }
+ *inpos = collendpos;
+ break;
+ default:
+ repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+ encoding, reason, p, size, exceptionObject,
+ collstartpos, collendpos, &newpos);
+ /* the helper may have stored a new reference from
+ PyCodec_LookupError in our by-value copy of errorHandler; the
+ caller never sees it, so release it here instead of leaking it.
+ NOTE(review): taking errorHandler as PyObject ** would also let
+ the caller cache the handler across errors, but that changes
+ the signature. */
+ if (errorHandler != callerHandler)
+ Py_DECREF(errorHandler);
+ if (repunicode == NULL)
+ return -1;
+ /* generate replacement */
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
+ x = charmapencode_output(*uni2, mapping, res, respos);
+ if (x==NULL) {
+ Py_DECREF(repunicode);
+ return -1;
+ }
+ else if (x==Py_None) {
+ Py_DECREF(repunicode);
+ Py_DECREF(x);
+ raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ return -1;
+ }
+ Py_DECREF(x);
+ }
+ /* resume where the callback asked to continue */
+ *inpos = newpos;
+ Py_DECREF(repunicode);
+ return 0;
PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
PyObject *mapping,
const char *errors)
- PyObject *v;
- char *s;
- int extrachars = 0;
+ /* output object */
+ PyObject *res = NULL;
+ /* current input position */
+ int inpos = 0;
+ /* current output position */
+ int respos = 0;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ /* the following variable is used for caching string comparisons
+ * -1=not initialized, 0=unknown, 1=strict, 2=replace,
+ * 3=ignore, 4=xmlcharrefreplace */
+ int known_errorHandler = -1;
/* Default to Latin-1 */
if (mapping == NULL)
return PyUnicode_EncodeLatin1(p, size, errors);
- v = PyString_FromStringAndSize(NULL, size);
- if (v == NULL)
- return NULL;
+ /* allocate enough for a simple encoding without
+ replacements, if we need more, we'll resize */
+ res = PyString_FromStringAndSize(NULL, size);
+ if (res == NULL)
+ goto onError;
if (size == 0)
- return v;
- s = PyString_AS_STRING(v);
- while (size-- > 0) {
- Py_UNICODE ch = *p++;
- PyObject *w, *x;
+ return res;
- /* Get mapping (Unicode ordinal -> string char, integer or None) */
- w = PyInt_FromLong((long)ch);
- if (w == NULL)
+ while (inpos<size) {
+ /* try to encode it */
+ PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
+ if (x==NULL) /* error */
goto onError;
- x = PyObject_GetItem(mapping, w);
- Py_DECREF(w);
- if (x == NULL) {
- if (PyErr_ExceptionMatches(PyExc_LookupError)) {
- /* No mapping found means: mapping is undefined. */
- PyErr_Clear();
- x = Py_None;
- Py_INCREF(x);
- } else
+ if (x==Py_None) { /* unencodable character */
+ if (charmap_encoding_error(p, size, &inpos, mapping,
+ &exc,
+ &known_errorHandler, errorHandler, errors,
+ &res, &respos))
goto onError;
+ else
+ /* done with this character => adjust input position */
+ ++inpos;
+ Py_DECREF(x);
+ }
- /* Apply mapping */
- if (PyInt_Check(x)) {
- long value = PyInt_AS_LONG(x);
- if (value < 0 || value > 255) {
- PyErr_SetString(PyExc_TypeError,
- "character mapping must be in range(256)");
- Py_DECREF(x);
- goto onError;
- }
- *s++ = (char)value;
- }
- else if (x == Py_None) {
- /* undefined mapping */
- if (charmap_encoding_error(&p, &s, errors,
- "character maps to <undefined>")) {
- Py_DECREF(x);
- goto onError;
- }
- }
- else if (PyString_Check(x)) {
- int targetsize = PyString_GET_SIZE(x);
- if (targetsize == 1)
- /* 1-1 mapping */
- *s++ = *PyString_AS_STRING(x);
- else if (targetsize > 1) {
- /* 1-n mapping */
- if (targetsize > extrachars) {
- /* resize first */
- int oldpos = (int)(s - PyString_AS_STRING(v));
- int needed = (targetsize - extrachars) + \
- (targetsize << 2);
- extrachars += needed;
- if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
- Py_DECREF(x);
- goto onError;
- }
- s = PyString_AS_STRING(v) + oldpos;
- }
- memcpy(s, PyString_AS_STRING(x), targetsize);
- s += targetsize;
- extrachars -= targetsize;
- }
- /* 1-0 mapping: skip the character */
- }
- else {
- /* wrong return value */
- PyErr_SetString(PyExc_TypeError,
- "character mapping must return integer, None or unicode");
- Py_DECREF(x);
+ /* Resize if we allocated too much */
+ if (respos<PyString_GET_SIZE(res)) {
+ if (_PyString_Resize(&res, respos))
goto onError;
- }
- Py_DECREF(x);
- if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
- _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
- return v;
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
+ return res;
- onError:
- Py_XDECREF(v);
+ onError:
+ Py_XDECREF(res);
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
return NULL;
+/* create or adjust a UnicodeTranslateError */
+static void make_translate_exception(PyObject **exceptionObject,
+ const Py_UNICODE *unicode, int size,
+ int startpos, int endpos,
+ const char *reason)
+ if (*exceptionObject == NULL) {
+ *exceptionObject = PyUnicodeTranslateError_Create(
+ unicode, size, startpos, endpos, reason);
+ }
+ else {
+ if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
+ goto onError;
+ if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
+ goto onError;
+ if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
+ goto onError;
+ return;
+ onError:
+ Py_DECREF(*exceptionObject);
+ *exceptionObject = NULL;
+ }
+/* raises a UnicodeTranslateError */
+static void raise_translate_exception(PyObject **exceptionObject,
+ const Py_UNICODE *unicode, int size,
+ int startpos, int endpos,
+ const char *reason)
+ make_translate_exception(exceptionObject,
+ unicode, size, startpos, endpos, reason);
+ if (*exceptionObject != NULL)
+ PyCodec_StrictErrors(*exceptionObject);
+/* error handling callback helper:
+ build arguments, call the callback and check the arguments,
+ put the result into newpos and return the replacement string, which
+ has to be freed by the caller */
+static PyObject *unicode_translate_call_errorhandler(const char *errors,
+ PyObject **errorHandler,
+ const char *reason,
+ const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
+ int startpos, int endpos,
+ int *newpos)
+ static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
+ PyObject *restuple;
+ PyObject *resunicode;
+ if (*errorHandler == NULL) {
+ *errorHandler = PyCodec_LookupError(errors);
+ if (*errorHandler == NULL)
+ return NULL;
+ }
+ make_translate_exception(exceptionObject,
+ unicode, size, startpos, endpos, reason);
+ if (*exceptionObject == NULL)
+ return NULL;
+ restuple = PyObject_CallFunctionObjArgs(
+ *errorHandler, *exceptionObject, NULL);
+ if (restuple == NULL)
+ return NULL;
+ if (!PyTuple_Check(restuple)) {
+ PyErr_Format(PyExc_TypeError, &argparse[4]);
+ Py_DECREF(restuple);
+ return NULL;
+ }
+ if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+ &resunicode, newpos)) {
+ Py_DECREF(restuple);
+ return NULL;
+ }
+ if (*newpos<0)
+ *newpos = 0;
+ else if (*newpos>size)
+ *newpos = size;
+ Py_INCREF(resunicode);
+ Py_DECREF(restuple);
+ return resunicode;
+/* Lookup the character ch in the mapping and put the result in result,
+ which must be decrefed by the caller.
+ Return 0 on success, -1 on error */
-int translate_error(const Py_UNICODE **source,
- Py_UNICODE **dest,
- const char *errors,
- const char *details)
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "translate error: %.400s",
- details);
- return -1;
+int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
+ /* On success *result is either NULL (mapping has no entry for c) or
+ the reference obtained from PyObject_GetItem, which the caller
+ must release. */
+ PyObject *w = PyInt_FromLong((long)c);
+ PyObject *x;
+ if (w == NULL)
+ return -1;
+ x = PyObject_GetItem(mapping, w);
+ Py_DECREF(w);
+ if (x == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_LookupError)) {
+ /* No mapping found means: use 1:1 mapping. */
+ PyErr_Clear();
+ *result = NULL;
+ return 0;
+ } else
+ return -1;
- else if (strcmp(errors,"ignore") == 0) {
+ else if (x == Py_None) {
+ /* undefined mapping: the None reference is handed to the caller */
+ *result = x;
return 0;
- else if (strcmp(errors,"replace") == 0) {
- **dest = '?';
- (*dest)++;
+ else if (PyInt_Check(x)) {
+ long value = PyInt_AS_LONG(x);
+ long max = PyUnicode_GetMax();
+ if (value < 0 || value > max) {
+ PyErr_Format(PyExc_TypeError,
+ "character mapping must be in range(0x%lx)", max+1);
+ Py_DECREF(x);
+ return -1;
+ }
+ *result = x;
+ return 0;
+ }
+ else if (PyUnicode_Check(x)) {
+ /* replacement string: ownership of x passes to the caller */
+ *result = x;
return 0;
else {
- PyErr_Format(PyExc_ValueError,
- "translate error; "
- "unknown error handling code: %.400s",
- errors);
+ /* wrong return value */
+ PyErr_SetString(PyExc_TypeError,
+ "character mapping must return integer, None or unicode");
+ /* x is not handed to the caller on this path, so release the
+ reference obtained from PyObject_GetItem */
+ Py_DECREF(x);
+ return -1;
+ }
+/* ensure that *outobj is at least requiredsize characters long,
+if not reallocate and adjust various state variables.
+Return 0 on success, -1 on error */
+int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
+ int requiredsize)
+ if (requiredsize > *outsize) {
+ /* remember old output position */
+ int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
+ /* exponentially overallocate to minimize reallocations */
+ if (requiredsize < 2 * *outsize)
+ requiredsize = 2 * *outsize;
+ if (_PyUnicode_Resize(outobj, requiredsize))
+ return -1;
+ *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
+ *outsize = requiredsize;
+ }
+ return 0;
+/* lookup the character, put the result in the output string and adjust
+ various state variables. Return a new reference to the object that
+ was put in the output buffer in *result, or Py_None, if the mapping was
+ undefined (in which case no character was written).
+ The caller must decref result.
+ Return 0 on success, -1 on error. */
+int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
+ PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
+ if (charmaptranslate_lookup(c, mapping, res))
return -1;
+ if (*res==NULL) {
+ /* not found => default to 1:1 mapping */
+ *(*outp)++ = (Py_UNICODE)c;
+ }
+ else if (*res==Py_None)
+ ;
+ else if (PyInt_Check(*res)) {
+ /* no overflow check, because we know that the space is enough */
+ *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
+ }
+ else if (PyUnicode_Check(*res)) {
+ int repsize = PyUnicode_GET_SIZE(*res);
+ if (repsize==1) {
+ /* no overflow check, because we know that the space is enough */
+ *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
+ }
+ else if (repsize!=0) {
+ /* more than one character */
+ int requiredsize = *outsize + repsize - 1;
+ if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
+ return -1;
+ memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
+ *outp += repsize;
+ }
+ else
+ return -1;
+ return 0;
+/* Translate the size characters starting at p through mapping and
+ return the result as a new unicode object (NULL on error, or when
+ mapping is NULL).
+ Each character is passed to charmaptranslate_output(): a missing
+ entry copies the character unchanged, an int or unicode result is
+ written to the output, and Py_None marks the character as
+ untranslatable. Runs of untranslatable characters are handled
+ according to errors: NULL/"strict" calls raise_translate_exception()
+ and fails, "replace" writes one '?' per character, "ignore" drops
+ them, "xmlcharrefreplace" writes "&#N;" for each, and any other name
+ is resolved through unicode_translate_call_errorhandler(). */
-PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
+PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
int size,
PyObject *mapping,
const char *errors)
- PyUnicodeObject *v;
- Py_UNICODE *p;
+ /* output object */
+ PyObject *res = NULL;
+ /* pointers to the beginning and end+1 of input */
+ const Py_UNICODE *startp = p;
+ const Py_UNICODE *endp = p + size;
+ /* pointer into the output */
+ Py_UNICODE *str;
+ /* current output position */
+ int respos = 0;
+ int ressize;
+ char *reason = "character maps to <undefined>";
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ /* the following variable is used for caching string comparisons
+ * -1=not initialized, 0=unknown, 1=strict, 2=replace,
+ * 3=ignore, 4=xmlcharrefreplace */
+ int known_errorHandler = -1;
if (mapping == NULL) {
return NULL;
- /* Output will never be longer than input */
- v = _PyUnicode_New(size);
- if (v == NULL)
- goto onError;
- if (size == 0)
- goto done;
- p = PyUnicode_AS_UNICODE(v);
- while (size-- > 0) {
- Py_UNICODE ch = *s++;
- PyObject *w, *x;
- /* Get mapping */
- w = PyInt_FromLong(ch);
- if (w == NULL)
- goto onError;
- x = PyObject_GetItem(mapping, w);
- Py_DECREF(w);
- if (x == NULL) {
- if (PyErr_ExceptionMatches(PyExc_LookupError)) {
- /* No mapping found: default to 1-1 mapping */
- PyErr_Clear();
- *p++ = ch;
- continue;
- }
+ /* allocate enough for a simple 1:1 translation without
+ replacements, if we need more, we'll resize */
+ res = PyUnicode_FromUnicode(NULL, size);
+ if (res == NULL)
+ goto onError;
+ if (size == 0)
+ return res;
+ str = PyUnicode_AS_UNICODE(res);
+ ressize = size;
+ while (p<endp) {
+ /* try to encode it */
+ PyObject *x = NULL;
+ if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
+ Py_XDECREF(x);
goto onError;
- /* Apply mapping */
- if (PyInt_Check(x))
- *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
- else if (x == Py_None) {
- /* undefined mapping */
- if (translate_error(&s, &p, errors,
- "character maps to <undefined>")) {
- Py_DECREF(x);
- goto onError;
+ if (x!=Py_None) { /* it worked => adjust input pointer */
+ /* x is either NULL (no mapping entry) or a new reference
+ returned by the lookup; release it here, otherwise every
+ translated character leaks one reference */
+ Py_XDECREF(x);
+ ++p;
+ }
+ else { /* untranslatable character */
+ PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+ int repsize;
+ int newpos;
+ Py_UNICODE *uni2;
+ /* startpos for collecting untranslatable chars */
+ const Py_UNICODE *collstart = p;
+ const Py_UNICODE *collend = p+1;
+ const Py_UNICODE *coll;
+ Py_XDECREF(x);
+ /* find all untranslatable characters */
+ while (collend < endp) {
+ if (charmaptranslate_lookup(*collend, mapping, &x))
+ goto onError;
+ Py_XDECREF(x);
+ if (x!=Py_None)
+ break;
+ ++collend;
- }
- else if (PyUnicode_Check(x)) {
- if (PyUnicode_GET_SIZE(x) != 1) {
- /* 1-n mapping */
- PyErr_SetString(PyExc_NotImplementedError,
- "1-n mappings are currently not implemented");
- Py_DECREF(x);
- goto onError;
+ /* cache callback name lookup
+ * (if not done yet, i.e. it's the first error) */
+ if (known_errorHandler==-1) {
+ if ((errors==NULL) || (!strcmp(errors, "strict")))
+ known_errorHandler = 1;
+ else if (!strcmp(errors, "replace"))
+ known_errorHandler = 2;
+ else if (!strcmp(errors, "ignore"))
+ known_errorHandler = 3;
+ else if (!strcmp(errors, "xmlcharrefreplace"))
+ known_errorHandler = 4;
+ else
+ known_errorHandler = 0;
+ }
+ switch (known_errorHandler) {
+ case 1: /* strict */
+ raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
+ goto onError;
+ case 2: /* replace */
+ /* No need to check for space, this is a 1:1 replacement */
+ for (coll = collstart; coll<collend; ++coll)
+ *str++ = '?';
+ /* fall through */
+ case 3: /* ignore */
+ p = collend;
+ break;
+ case 4: /* xmlcharrefreplace */
+ /* generate replacement (temporarily (mis)uses p) */
+ for (p = collstart; p < collend; ++p) {
+ char buffer[2+29+1+1];
+ char *cp;
+ sprintf(buffer, "&#%d;", (int)*p);
+ if (charmaptranslate_makespace(&res, &str, &ressize,
+ (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
+ goto onError;
+ for (cp = buffer; *cp; ++cp)
+ *str++ = *cp;
+ }
+ p = collend;
+ break;
+ default:
+ repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
+ reason, startp, size, &exc,
+ collstart-startp, collend-startp, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ /* generate replacement */
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ if (charmaptranslate_makespace(&res, &str, &ressize,
+ (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
+ Py_DECREF(repunicode);
+ goto onError;
+ }
+ for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
+ *str++ = *uni2;
+ /* continue at the position chosen by the error handler */
+ p = startp + newpos;
+ Py_DECREF(repunicode);
- *p++ = *PyUnicode_AS_UNICODE(x);
- }
- else {
- /* wrong return value */
- PyErr_SetString(PyExc_TypeError,
- "translate mapping must return integer, None or unicode");
- Py_DECREF(x);
- goto onError;
- Py_DECREF(x);
- if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
- if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
+ /* Resize if we allocated too much */
+ respos = str-PyUnicode_AS_UNICODE(res);
+ if (respos<ressize) {
+ if (_PyUnicode_Resize(&res, respos))
goto onError;
+ }
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
+ return res;
- done:
- return (PyObject *)v;
- onError:
- Py_XDECREF(v);
+ onError:
+ Py_XDECREF(res);
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
return NULL;
const char *errors)
Py_UNICODE *p, *end;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ const char *encoding = "decimal";
+ const char *reason = "invalid decimal Unicode string";
+ /* the following variable is used for caching string comparisons
+ * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
+ int known_errorHandler = -1;
if (output == NULL) {
p = s;
end = s + length;
while (p < end) {
- register Py_UNICODE ch = *p++;
+ register Py_UNICODE ch = *p;
int decimal;
+ PyObject *repunicode;
+ int repsize;
+ int newpos;
+ Py_UNICODE *uni2;
+ Py_UNICODE *collstart;
+ Py_UNICODE *collend;
*output++ = ' ';
+ ++p;
decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0) {
*output++ = '0' + decimal;
+ ++p;
if (0 < ch && ch < 256) {
*output++ = (char)ch;
+ ++p;
- /* All other characters are considered invalid */
- if (errors == NULL || strcmp(errors, "strict") == 0) {
- PyErr_SetString(PyExc_ValueError,
- "invalid decimal Unicode string");
- goto onError;
+ /* All other characters are considered unencodable */
+ collstart = p;
+ collend = p+1;
+ while (collend < end) {
+ if ((0 < *collend && *collend < 256) ||
+ !Py_UNICODE_ISSPACE(*collend) ||
+ break;
- else if (strcmp(errors, "ignore") == 0)
- continue;
- else if (strcmp(errors, "replace") == 0) {
- *output++ = '?';
- continue;
+ /* cache callback name lookup
+ * (if not done yet, i.e. it's the first error) */
+ if (known_errorHandler==-1) {
+ if ((errors==NULL) || (!strcmp(errors, "strict")))
+ known_errorHandler = 1;
+ else if (!strcmp(errors, "replace"))
+ known_errorHandler = 2;
+ else if (!strcmp(errors, "ignore"))
+ known_errorHandler = 3;
+ else if (!strcmp(errors, "xmlcharrefreplace"))
+ known_errorHandler = 4;
+ else
+ known_errorHandler = 0;
+ }
+ switch (known_errorHandler) {
+ case 1: /* strict */
+ raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
+ goto onError;
+ case 2: /* replace */
+ for (p = collstart; p < collend; ++p)
+ *output++ = '?';
+ /* fall through */
+ case 3: /* ignore */
+ p = collend;
+ break;
+ case 4: /* xmlcharrefreplace */
+ /* generate replacement (temporarily (mis)uses p) */
+ for (p = collstart; p < collend; ++p)
+ output += sprintf(output, "&#%d;", (int)*p);
+ p = collend;
+ break;
+ default:
+ repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+ encoding, reason, s, length, &exc,
+ collstart-s, collend-s, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ /* generate replacement */
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
+ Py_UNICODE ch = *uni2;
+ *output++ = ' ';
+ else {
+ decimal = Py_UNICODE_TODECIMAL(ch);
+ if (decimal >= 0)
+ *output++ = '0' + decimal;
+ else if (0 < ch && ch < 256)
+ *output++ = (char)ch;
+ else {
+ Py_DECREF(repunicode);
+ raise_encode_exception(&exc, encoding,
+ s, length, collstart-s, collend-s, reason);
+ goto onError;
+ }
+ }
+ }
+ p = s + newpos;
+ Py_DECREF(repunicode);
/* 0-terminate the output string */
*output++ = '\0';
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
return 0;
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
return -1;
Return an encoded string version of S. Default encoding is the current\n\
default string encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
-a ValueError. Other possible values are 'ignore' and 'replace'.");
+a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
+'xmlcharrefreplace' as well as any other name registered with\n\
+codecs.register_error that can handle UnicodeEncodeErrors.");
static PyObject *
unicode_encode(PyUnicodeObject *self, PyObject *args)
| +-- ValueError\n\
| | |\n\
| | +-- UnicodeError\n\
+ | | |\n\
+ | | +-- UnicodeEncodeError\n\
+ | | +-- UnicodeDecodeError\n\
+ | | +-- UnicodeTranslateError\n\
| |\n\
| +-- ReferenceError\n\
| +-- SystemError\n\
+int get_int(PyObject *exc, const char *name, int *value) /*
+ Read attribute `name` of exc into *value. Returns 0 on success;
+ returns -1 if the attribute lookup fails or (with TypeError set)
+ if the attribute is not an int. */
+ PyObject *attr = PyObject_GetAttrString(exc, (char *)name);
+ if (!attr)
+ return -1;
+ if (!PyInt_Check(attr)) {
+ PyErr_Format(PyExc_TypeError, "%s attribute must be int", name);
+ Py_DECREF(attr);
+ return -1;
+ }
+ *value = PyInt_AS_LONG(attr);
+ Py_DECREF(attr);
+ return 0;
+int set_int(PyObject *exc, const char *name, int value) /*
+ Store value as an int object in attribute `name` of exc. Returns -1
+ if the int object cannot be created, otherwise the result of
+ PyObject_SetAttrString(). */
+ PyObject *obj = PyInt_FromLong(value);
+ int result;
+ if (!obj)
+ return -1;
+ result = PyObject_SetAttrString(exc, (char *)name, obj);
+ Py_DECREF(obj); /* drop our own reference to the temporary */
+ return result;
+PyObject *get_string(PyObject *exc, const char *name) /*
+ Return attribute `name` of exc, which must be a str. Returns NULL if
+ the lookup fails or (with TypeError set) if the attribute has another
+ type. The result is handed to the caller without being released here;
+ callers in this file Py_DECREF it when done. */
+ PyObject *attr = PyObject_GetAttrString(exc, (char *)name);
+ if (!attr)
+ return NULL;
+ if (!PyString_Check(attr)) {
+ PyErr_Format(PyExc_TypeError, "%s attribute must be str", name);
+ Py_DECREF(attr);
+ return NULL;
+ }
+ return attr;
+int set_string(PyObject *exc, const char *name, const char *value) /*
+ Store the C string value as a str object in attribute `name` of exc.
+ Returns -1 if the str object cannot be created, otherwise the result
+ of PyObject_SetAttrString(). */
+ PyObject *obj = PyString_FromString(value);
+ int result;
+ if (!obj)
+ return -1;
+ result = PyObject_SetAttrString(exc, (char *)name, obj);
+ Py_DECREF(obj); /* drop our own reference to the temporary */
+ return result;
+PyObject *get_unicode(PyObject *exc, const char *name) /*
+ Return attribute `name` of exc, which must be a unicode object.
+ Returns NULL if the lookup fails or (with TypeError set) if the
+ attribute has another type. The result is handed to the caller
+ without being released here; callers in this file Py_DECREF it. */
+ PyObject *attr = PyObject_GetAttrString(exc, (char *)name);
+ if (!attr)
+ return NULL;
+ if (!PyUnicode_Check(attr)) {
+ PyErr_Format(PyExc_TypeError, "%s attribute must be unicode", name);
+ Py_DECREF(attr);
+ return NULL;
+ }
+ return attr;
+PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *exc) /* "encoding" attribute, must be a str; NULL on error */
+ return get_string(exc, "encoding");
+PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *exc) /* "encoding" attribute, must be a str; NULL on error */
+ return get_string(exc, "encoding");
+PyObject * PyUnicodeTranslateError_GetEncoding(PyObject *exc) /* NOTE(review): UnicodeTranslateError__init__ in this file does not set an "encoding" attribute, so this lookup presumably fails for such instances -- confirm whether this accessor is intended */
+ return get_string(exc, "encoding");
+PyObject *PyUnicodeEncodeError_GetObject(PyObject *exc) /* "object" attribute, must be unicode (the text being encoded) */
+ return get_unicode(exc, "object");
+PyObject *PyUnicodeDecodeError_GetObject(PyObject *exc) /* "object" attribute, must be a str (the bytes being decoded) */
+ return get_string(exc, "object");
+PyObject *PyUnicodeTranslateError_GetObject(PyObject *exc) /* "object" attribute, must be unicode */
+ return get_unicode(exc, "object");
+int PyUnicodeEncodeError_GetStart(PyObject *exc, int *start) /*
+ Store the "start" attribute of exc in *start, clamped against the
+ length of the unicode "object" attribute. Returns 0 on success, -1 if
+ either attribute cannot be fetched. */
+ if (!get_int(exc, "start", start)) {
+ PyObject *object = PyUnicodeEncodeError_GetObject(exc);
+ int size;
+ if (!object)
+ return -1;
+ size = PyUnicode_GET_SIZE(object);
+ if (*start<0)
+ *start = 0;
+ if (*start>=size)
+ *start = size-1; /* NOTE(review): yields -1 when the object is empty (size == 0) */
+ Py_DECREF(object);
+ return 0;
+ }
+ return -1;
+int PyUnicodeDecodeError_GetStart(PyObject *exc, int *start) /*
+ Store the "start" attribute of exc in *start, clamped against the
+ length of the str "object" attribute. Returns 0 on success, -1 if
+ either attribute cannot be fetched. */
+ if (!get_int(exc, "start", start)) {
+ PyObject *object = PyUnicodeDecodeError_GetObject(exc);
+ int size;
+ if (!object)
+ return -1;
+ size = PyString_GET_SIZE(object);
+ if (*start<0)
+ *start = 0;
+ if (*start>=size)
+ *start = size-1; /* NOTE(review): yields -1 when the object is empty (size == 0) */
+ Py_DECREF(object);
+ return 0;
+ }
+ return -1;
+int PyUnicodeTranslateError_GetStart(PyObject *exc, int *start) /* delegates to the encode variant, which also expects a unicode "object" */
+ return PyUnicodeEncodeError_GetStart(exc, start);
+int PyUnicodeEncodeError_SetStart(PyObject *exc, int start) /* stores start as the "start" attribute without range checking */
+ return set_int(exc, "start", start);
+int PyUnicodeDecodeError_SetStart(PyObject *exc, int start) /* stores start as the "start" attribute without range checking */
+ return set_int(exc, "start", start);
+int PyUnicodeTranslateError_SetStart(PyObject *exc, int start) /* stores start as the "start" attribute without range checking */
+ return set_int(exc, "start", start);
+int PyUnicodeEncodeError_GetEnd(PyObject *exc, int *end) /*
+ Store the "end" attribute of exc in *end: values below 1 are raised
+ to 1, then values above the length of the unicode "object" attribute
+ are lowered to that length. Returns 0 on success, -1 if either
+ attribute cannot be fetched. */
+ if (!get_int(exc, "end", end)) {
+ PyObject *object = PyUnicodeEncodeError_GetObject(exc);
+ int size;
+ if (!object)
+ return -1;
+ size = PyUnicode_GET_SIZE(object);
+ if (*end<1)
+ *end = 1;
+ if (*end>size)
+ *end = size;
+ Py_DECREF(object);
+ return 0;
+ }
+ return -1;
+int PyUnicodeDecodeError_GetEnd(PyObject *exc, int *end) /*
+ Store the "end" attribute of exc in *end: values below 1 are raised
+ to 1, then values above the length of the str "object" attribute are
+ lowered to that length. Returns 0 on success, -1 if either attribute
+ cannot be fetched. */
+ if (!get_int(exc, "end", end)) {
+ PyObject *object = PyUnicodeDecodeError_GetObject(exc);
+ int size;
+ if (!object)
+ return -1;
+ size = PyString_GET_SIZE(object);
+ if (*end<1)
+ *end = 1;
+ if (*end>size)
+ *end = size;
+ Py_DECREF(object);
+ return 0;
+ }
+ return -1;
+int PyUnicodeTranslateError_GetEnd(PyObject *exc, int *start) /* despite its name, the `start` parameter receives the clamped "end" value; delegates to the encode variant */
+ return PyUnicodeEncodeError_GetEnd(exc, start);
+int PyUnicodeEncodeError_SetEnd(PyObject *exc, int end) /* stores end as the "end" attribute without range checking */
+ return set_int(exc, "end", end);
+int PyUnicodeDecodeError_SetEnd(PyObject *exc, int end) /* stores end as the "end" attribute without range checking */
+ return set_int(exc, "end", end);
+int PyUnicodeTranslateError_SetEnd(PyObject *exc, int end) /* stores end as the "end" attribute without range checking */
+ return set_int(exc, "end", end);
+PyObject *PyUnicodeEncodeError_GetReason(PyObject *exc) /* "reason" attribute, must be a str; NULL on error */
+ return get_string(exc, "reason");
+PyObject *PyUnicodeDecodeError_GetReason(PyObject *exc) /* "reason" attribute, must be a str; NULL on error */
+ return get_string(exc, "reason");
+PyObject *PyUnicodeTranslateError_GetReason(PyObject *exc) /* "reason" attribute, must be a str; NULL on error */
+ return get_string(exc, "reason");
+int PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason) /* replaces the "reason" attribute with a str built from the C string */
+ return set_string(exc, "reason", reason);
+int PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason) /* replaces the "reason" attribute with a str built from the C string */
+ return set_string(exc, "reason", reason);
+int PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason) /* replaces the "reason" attribute with a str built from the C string */
+ return set_string(exc, "reason", reason);
+/* Shared __init__ implementation for UnicodeEncodeError and
+ UnicodeDecodeError.
+ args is the complete argument tuple including self. The remaining five
+ arguments must be (encoding: str, object: objecttype, start: int,
+ end: int, reason: str); they are stored on self as the attributes
+ "encoding", "object", "start", "end" and "reason", and the tuple
+ without self is stored as "args".
+ Return a new reference to Py_None on success, NULL on error. */
+static PyObject *
+UnicodeError__init__(PyObject *self, PyObject *args, PyTypeObject *objecttype)
+ PyObject *rtnval = NULL;
+ PyObject *encoding;
+ PyObject *object;
+ PyObject *start;
+ PyObject *end;
+ PyObject *reason;
+ if (!(self = get_self(args)))
+ return NULL;
+ /* from here on args is a new tuple (the arguments without self) that
+ this function owns and must release on every exit path */
+ if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args))))
+ return NULL;
+ if (!PyArg_ParseTuple(args, "O!O!O!O!O!",
+ &PyString_Type, &encoding,
+ objecttype, &object,
+ &PyInt_Type, &start,
+ &PyInt_Type, &end,
+ &PyString_Type, &reason))
+ /* leave through the cleanup path; returning directly would leak
+ the slice created above */
+ goto finally;
+ if (PyObject_SetAttrString(self, "args", args))
+ goto finally;
+ if (PyObject_SetAttrString(self, "encoding", encoding))
+ goto finally;
+ if (PyObject_SetAttrString(self, "object", object))
+ goto finally;
+ if (PyObject_SetAttrString(self, "start", start))
+ goto finally;
+ if (PyObject_SetAttrString(self, "end", end))
+ goto finally;
+ if (PyObject_SetAttrString(self, "reason", reason))
+ goto finally;
+ Py_INCREF(Py_None);
+ rtnval = Py_None;
+ finally:
+ Py_DECREF(args);
+ return rtnval;
+static PyObject *
+UnicodeEncodeError__init__(PyObject *self, PyObject *args) /* the "object" argument of an encode error must be a unicode object */
+ return UnicodeError__init__(self, args, &PyUnicode_Type);
+static PyObject *
+UnicodeEncodeError__str__(PyObject *self, PyObject *arg) /*
+ Build the str() message from the "encoding", "object", "start", "end"
+ and "reason" attributes. A one-character range names the offending
+ character; a longer range reports only the first and last position.
+ Returns a new str object, or NULL if an attribute cannot be read. */
+ PyObject *encodingObj = NULL;
+ PyObject *objectObj = NULL;
+ int length; /* assigned below but not read in the code visible here */
+ int start;
+ int end;
+ PyObject *reasonObj = NULL;
+ char buffer[1000]; /* both %.400s fields are capped, and PyOS_snprintf is given sizeof(buffer) */
+ PyObject *result = NULL;
+ self = arg; /* the instance is taken from arg (the method is registered with METH_O) */
+ if (!(encodingObj = PyUnicodeEncodeError_GetEncoding(self)))
+ goto error;
+ if (!(objectObj = PyUnicodeEncodeError_GetObject(self)))
+ goto error;
+ length = PyUnicode_GET_SIZE(objectObj);
+ if (PyUnicodeEncodeError_GetStart(self, &start))
+ goto error;
+ if (PyUnicodeEncodeError_GetEnd(self, &end))
+ goto error;
+ if (!(reasonObj = PyUnicodeEncodeError_GetReason(self)))
+ goto error;
+ if (end==start+1) {
+ PyOS_snprintf(buffer, sizeof(buffer),
+ "'%.400s' codec can't encode character '\\u%x' in position %d: %.400s",
+ PyString_AS_STRING(encodingObj),
+ (int)PyUnicode_AS_UNICODE(objectObj)[start],
+ start,
+ PyString_AS_STRING(reasonObj)
+ );
+ }
+ else {
+ PyOS_snprintf(buffer, sizeof(buffer),
+ "'%.400s' codec can't encode characters in position %d-%d: %.400s",
+ PyString_AS_STRING(encodingObj),
+ start,
+ end-1, /* end is exclusive, so report the last affected index */
+ PyString_AS_STRING(reasonObj)
+ );
+ }
+ result = PyString_FromString(buffer);
+ Py_XDECREF(reasonObj); /* XDECREF: these may still be NULL */
+ Py_XDECREF(objectObj);
+ Py_XDECREF(encodingObj);
+ return result;
+static PyMethodDef UnicodeEncodeError_methods[] = { /* methods of UnicodeEncodeError; referenced from the exception table entry for that class */
+ {"__init__", UnicodeEncodeError__init__, METH_VARARGS},
+ {"__str__", UnicodeEncodeError__str__, METH_O},
+PyObject * PyUnicodeEncodeError_Create(
+ const char *encoding, const Py_UNICODE *object, int length,
+ int start, int end, const char *reason) /*
+ Create a UnicodeEncodeError by calling the exception class with
+ (encoding, unicode object built from object/length, start, end,
+ reason). Returns the result of the call (NULL on failure). */
+ return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#iis",
+ encoding, object, length, start, end, reason);
+static PyObject *
+UnicodeDecodeError__init__(PyObject *self, PyObject *args) /* the "object" argument of a decode error must be a str (byte string) */
+ return UnicodeError__init__(self, args, &PyString_Type);
+static PyObject *
+UnicodeDecodeError__str__(PyObject *self, PyObject *arg) /*
+ Build the str() message from the "encoding", "object", "start", "end"
+ and "reason" attributes. A one-byte range shows the offending byte in
+ hex; a longer range reports only the first and last position.
+ Returns a new str object, or NULL if an attribute cannot be read. */
+ PyObject *encodingObj = NULL;
+ PyObject *objectObj = NULL;
+ int length; /* assigned below but not read in the code visible here */
+ int start;
+ int end;
+ PyObject *reasonObj = NULL;
+ char buffer[1000]; /* both %.400s fields are capped, and PyOS_snprintf is given sizeof(buffer) */
+ PyObject *result = NULL;
+ self = arg; /* the instance is taken from arg (the method is registered with METH_O) */
+ if (!(encodingObj = PyUnicodeDecodeError_GetEncoding(self)))
+ goto error;
+ if (!(objectObj = PyUnicodeDecodeError_GetObject(self)))
+ goto error;
+ length = PyString_GET_SIZE(objectObj);
+ if (PyUnicodeDecodeError_GetStart(self, &start))
+ goto error;
+ if (PyUnicodeDecodeError_GetEnd(self, &end))
+ goto error;
+ if (!(reasonObj = PyUnicodeDecodeError_GetReason(self)))
+ goto error;
+ if (end==start+1) {
+ PyOS_snprintf(buffer, sizeof(buffer),
+ "'%.400s' codec can't decode byte 0x%x in position %d: %.400s",
+ PyString_AS_STRING(encodingObj),
+ ((int)PyString_AS_STRING(objectObj)[start])&0xff, /* mask so a negative char value prints as one byte */
+ start,
+ PyString_AS_STRING(reasonObj)
+ );
+ }
+ else {
+ PyOS_snprintf(buffer, sizeof(buffer),
+ "'%.400s' codec can't decode bytes in position %d-%d: %.400s",
+ PyString_AS_STRING(encodingObj),
+ start,
+ end-1, /* end is exclusive, so report the last affected index */
+ PyString_AS_STRING(reasonObj)
+ );
+ }
+ result = PyString_FromString(buffer);
+ Py_XDECREF(reasonObj); /* XDECREF: these may still be NULL */
+ Py_XDECREF(objectObj);
+ Py_XDECREF(encodingObj);
+ return result;
+static PyMethodDef UnicodeDecodeError_methods[] = { /* methods of UnicodeDecodeError; referenced from the exception table entry for that class */
+ {"__init__", UnicodeDecodeError__init__, METH_VARARGS},
+ {"__str__", UnicodeDecodeError__str__, METH_O},
+PyObject * PyUnicodeDecodeError_Create(
+ const char *encoding, const char *object, int length,
+ int start, int end, const char *reason) /*
+ Create a UnicodeDecodeError by calling the exception class with
+ (encoding, str built from object/length, start, end, reason).
+ Returns the result of the call (NULL on failure). */
+ return PyObject_CallFunction(PyExc_UnicodeDecodeError, "ss#iis",
+ encoding, object, length, start, end, reason);
+static PyObject *
+UnicodeTranslateError__init__(PyObject *self, PyObject *args) /*
+ __init__ for UnicodeTranslateError. args is the complete argument
+ tuple including self; the remaining four arguments must be
+ (object: unicode, start: int, end: int, reason: str). They are stored
+ on self as "object", "start", "end" and "reason", and the tuple
+ without self is stored as "args". No "encoding" attribute is set.
+ Returns a new reference to Py_None on success, NULL on error. */
+ PyObject *rtnval = NULL;
+ PyObject *object;
+ PyObject *start;
+ PyObject *end;
+ PyObject *reason;
+ if (!(self = get_self(args)))
+ return NULL;
+ if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args)))) /* args now names a new tuple released at `finally` */
+ return NULL;
+ if (!PyArg_ParseTuple(args, "O!O!O!O!",
+ &PyUnicode_Type, &object,
+ &PyInt_Type, &start,
+ &PyInt_Type, &end,
+ &PyString_Type, &reason))
+ goto finally;
+ if (PyObject_SetAttrString(self, "args", args))
+ goto finally;
+ if (PyObject_SetAttrString(self, "object", object))
+ goto finally;
+ if (PyObject_SetAttrString(self, "start", start))
+ goto finally;
+ if (PyObject_SetAttrString(self, "end", end))
+ goto finally;
+ if (PyObject_SetAttrString(self, "reason", reason))
+ goto finally;
+ Py_INCREF(Py_None);
+ rtnval = Py_None;
+ finally:
+ Py_DECREF(args);
+ return rtnval;
+static PyObject *
+UnicodeTranslateError__str__(PyObject *self, PyObject *arg) /*
+ Build the str() message from the "object", "start", "end" and
+ "reason" attributes. A one-character range names the offending
+ character; a longer range reports only the first and last position.
+ Returns a new str object, or NULL if an attribute cannot be read. */
+ PyObject *objectObj = NULL;
+ int length; /* assigned below but not read in the code visible here */
+ int start;
+ int end;
+ PyObject *reasonObj = NULL;
+ char buffer[1000]; /* the %.400s field is capped, and PyOS_snprintf is given sizeof(buffer) */
+ PyObject *result = NULL;
+ self = arg; /* the instance is taken from arg (the method is registered with METH_O) */
+ if (!(objectObj = PyUnicodeTranslateError_GetObject(self)))
+ goto error;
+ length = PyUnicode_GET_SIZE(objectObj);
+ if (PyUnicodeTranslateError_GetStart(self, &start))
+ goto error;
+ if (PyUnicodeTranslateError_GetEnd(self, &end))
+ goto error;
+ if (!(reasonObj = PyUnicodeTranslateError_GetReason(self)))
+ goto error;
+ if (end==start+1) {
+ PyOS_snprintf(buffer, sizeof(buffer),
+ "can't translate character '\\u%x' in position %d: %.400s",
+ (int)PyUnicode_AS_UNICODE(objectObj)[start],
+ start,
+ PyString_AS_STRING(reasonObj)
+ );
+ }
+ else {
+ PyOS_snprintf(buffer, sizeof(buffer),
+ "can't translate characters in position %d-%d: %.400s",
+ start,
+ end-1, /* end is exclusive, so report the last affected index */
+ PyString_AS_STRING(reasonObj)
+ );
+ }
+ result = PyString_FromString(buffer);
+ Py_XDECREF(reasonObj); /* XDECREF: these may still be NULL */
+ Py_XDECREF(objectObj);
+ return result;
+static PyMethodDef UnicodeTranslateError_methods[] = { /* methods of UnicodeTranslateError; referenced from the exception table entry for that class */
+ {"__init__", UnicodeTranslateError__init__, METH_VARARGS},
+ {"__str__", UnicodeTranslateError__str__, METH_O},
+PyObject * PyUnicodeTranslateError_Create(
+ const Py_UNICODE *object, int length,
+ int start, int end, const char *reason) /*
+ Create a UnicodeTranslateError by calling the exception class with
+ (unicode object built from object/length, start, end, reason).
+ Returns the result of the call (NULL on failure). */
+ return PyObject_CallFunction(PyExc_UnicodeTranslateError, "u#iis",
+ object, length, start, end, reason);
/* Exception doc strings */
PyDoc_STRVAR(UnicodeError__doc__, "Unicode related error.");
+PyDoc_STRVAR(UnicodeEncodeError__doc__, "Unicode encoding error.");
+PyDoc_STRVAR(UnicodeDecodeError__doc__, "Unicode decoding error.");
+PyDoc_STRVAR(UnicodeTranslateError__doc__, "Unicode translation error.");
"Internal error in the Python interpreter.\n\
PyObject *PyExc_SystemExit;
PyObject *PyExc_UnboundLocalError;
PyObject *PyExc_UnicodeError;
+PyObject *PyExc_UnicodeEncodeError;
+PyObject *PyExc_UnicodeDecodeError;
+PyObject *PyExc_UnicodeTranslateError;
PyObject *PyExc_TypeError;
PyObject *PyExc_ValueError;
PyObject *PyExc_ZeroDivisionError;
{"ValueError", &PyExc_ValueError, 0, ValueError__doc__},
{"UnicodeError", &PyExc_UnicodeError, &PyExc_ValueError, UnicodeError__doc__},
+ {"UnicodeEncodeError", &PyExc_UnicodeEncodeError, &PyExc_UnicodeError,
+ UnicodeEncodeError__doc__, UnicodeEncodeError_methods},
+ {"UnicodeDecodeError", &PyExc_UnicodeDecodeError, &PyExc_UnicodeError,
+ UnicodeDecodeError__doc__, UnicodeDecodeError_methods},
+ {"UnicodeTranslateError", &PyExc_UnicodeTranslateError, &PyExc_UnicodeError,
+ UnicodeTranslateError__doc__, UnicodeTranslateError_methods},
{"ReferenceError", &PyExc_ReferenceError, 0, ReferenceError__doc__},
{"SystemError", &PyExc_SystemError, 0, SystemError__doc__},
{"MemoryError", &PyExc_MemoryError, 0, MemoryError__doc__},