self.assertEqual(coder(input), (expect, len(input)))
return check
+
class Queue(object):
"""
queue: write bytes at one end, read bytes from the other end
self._buffer = self._buffer[size:]
return s
+
class MixInCheckStateHandling:
def check_state_handling_decode(self, encoding, u, s):
for i in range(len(s)+1):
part2 = d.encode(u[i:], True)
self.assertEqual(s, part1+part2)
+
class ReadTest(MixInCheckStateHandling):
def check_partial(self, input, partialresults):
# get a StreamReader for the encoding and feed the bytestring version
self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
before + backslashreplace + after)
+
class UTF32Test(ReadTest, unittest.TestCase):
encoding = "utf-32"
if sys.byteorder == 'little':
self.assertEqual('\U00010000' * 1024,
codecs.utf_32_decode(encoded_be)[0])
+
class UTF32LETest(ReadTest, unittest.TestCase):
encoding = "utf-32-le"
ill_formed_sequence = b"\x80\xdc\x00\x00"
self.assertEqual('\U00010000' * 1024,
codecs.utf_32_le_decode(encoded)[0])
+
class UTF32BETest(ReadTest, unittest.TestCase):
encoding = "utf-32-be"
ill_formed_sequence = b"\x00\x00\xdc\x80"
with self.assertRaises(UnicodeDecodeError):
b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
+
@unittest.skipUnless(sys.platform == 'win32',
'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
+
class RecodingTest(unittest.TestCase):
def test_recoding(self):
f = io.BytesIO()
if len(i)!=2:
print(repr(i))
+
class PunycodeTest(unittest.TestCase):
def test_encode(self):
for uni, puny in punycode_testcases:
puny = puny.decode("ascii").encode("ascii")
self.assertEqual(uni, puny.decode("punycode"))
+
class UnicodeInternalTest(unittest.TestCase):
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_bug1251300(self):
except Exception as e:
raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
+
class IDNACodecTest(unittest.TestCase):
def test_builtin_decode(self):
self.assertEqual(str(b"python.org", "idna"), "python.org")
self.assertRaises(Exception,
b"python.org".decode, "idna", errors)
+
class CodecsModuleTest(unittest.TestCase):
def test_decode(self):
self.assertRaises(UnicodeError,
codecs.decode, b'abc', 'undefined', errors)
+
class StreamReaderTest(unittest.TestCase):
def setUp(self):
f = self.reader(self.stream)
self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
+
class EncodedFileTest(unittest.TestCase):
def test_basic(self):
"unicode_internal"
]
+
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
s = "abc123" # all codecs should be able to encode these
self.check_state_handling_decode(encoding, u, u.encode(encoding))
self.check_state_handling_encode(encoding, u, u.encode(encoding))
+
class CharmapTest(unittest.TestCase):
def test_decode_with_string_map(self):
self.assertEqual(
info.streamwriter, 'strict') as srw:
self.assertEqual(srw.read(), "\xfc")
+
class TypesTest(unittest.TestCase):
def test_decode_unicode(self):
# Most decoders don't accept unicode input
bytes_transform_encodings.append("bz2_codec")
transform_aliases["bz2_codec"] = ["bz2"]
+
class TransformCodecTest(unittest.TestCase):
def test_basics(self):
self.assertEqual(decoded, ('abc', 3))
+class ASCIITest(unittest.TestCase):  # checks bytes.decode('ascii', ...) under non-strict error handlers
+ def test_decode(self):  # table-driven: each row is (input bytes, error handler name, expected str)
+ for data, error_handler, expected in (
+ (b'[\x80\xff]', 'ignore', '[]'),  # both non-ASCII bytes are dropped
+ (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),  # each non-ASCII byte becomes U+FFFD
+ (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),  # each non-ASCII byte b becomes U+DC00 + b
+ (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),  # each non-ASCII byte becomes a literal \xNN escape
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):  # one sub-test per row, tagged with that row's values
+ self.assertEqual(data.decode('ascii', error_handler),
+ expected)
+
+
if __name__ == "__main__":  # only when this file is executed directly, not when imported
unittest.main()
/* --- 7-bit ASCII Codec -------------------------------------------------- */
+typedef enum { /* tag for an "errors" handler name; see get_error_handler() for the mapping */
+ _Py_ERROR_UNKNOWN=0, /* name not looked up yet; get_error_handler() never returns this */
+ _Py_ERROR_SURROGATEESCAPE, /* errors == "surrogateescape" */
+ _Py_ERROR_REPLACE, /* errors == "replace" */
+ _Py_ERROR_IGNORE, /* errors == "ignore" */
+ _Py_ERROR_OTHER /* errors is NULL or any name not listed above */
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors) /* map an error-handler name to its enum tag (exact strcmp match) */
+{
+ if (errors == NULL) /* tested first so that strcmp() below never receives NULL */
+ return _Py_ERROR_OTHER;
+ if (strcmp(errors, "surrogateescape") == 0)
+ return _Py_ERROR_SURROGATEESCAPE;
+ if (strcmp(errors, "ignore") == 0)
+ return _Py_ERROR_IGNORE;
+ if (strcmp(errors, "replace") == 0)
+ return _Py_ERROR_REPLACE;
+ return _Py_ERROR_OTHER; /* any name not matched above */
+}
+
PyObject * /* result of _PyUnicodeWriter_Finish() on success, NULL via the onError path */
PyUnicode_DecodeASCII(const char *s,
Py_ssize_t size,
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; /* looked up lazily, on the first non-ASCII byte */
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
PyUnicode_WRITE(kind, data, writer.pos, c);
writer.pos++;
++s;
+ continue;
}
- else {
+
+ /* byte outside range 0x00..0x7f: call the error handler */
+
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler)
+ {
+ case _Py_ERROR_REPLACE:
+ case _Py_ERROR_SURROGATEESCAPE:
+ /* Fast-path: the error handler only writes one character,
+ but we must switch to UCS2 at the first write */
+ if (kind < PyUnicode_2BYTE_KIND) {
+ if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+ 0xffff) < 0)
+ goto onError; /* was "return NULL": that skipped _PyUnicodeWriter_Dealloc() and leaked the writer */
+ kind = writer.kind;
+ data = writer.data;
+ }
+
+ if (error_handler == _Py_ERROR_REPLACE)
+ PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+ else
+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); /* surrogateescape: code point is 0xdc00 + byte value */
+ writer.pos++;
+ ++s;
+ break;
+
+ case _Py_ERROR_IGNORE:
+ ++s; /* consume the byte without writing anything */
+ break;
+
+ default: /* any other handler goes through the generic error-handler call below */
startinpos = s-starts;
endinpos = startinpos + 1;
if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
+ errors, &error_handler_obj,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
&writer))
data = writer.data;
}
}
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
_PyUnicodeWriter_Dealloc(&writer);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}