verify(repr(u"'\"") == """u'\\'"'""")
verify(repr(u"'") == '''u"'"''')
verify(repr(u'"') == """u'"'""")
- verify(repr(u''.join(map(unichr, range(256)))) ==
- "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
- "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
- "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
- "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
- "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
- "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
- "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
- "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
- "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
- "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
- "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
- "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
- "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
- "\\xfe\\xff'")
+ latin1repr = (
+ "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
+ "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
+ "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
+ "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
+ "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
+ "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
+ "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
+ "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
+ "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
+ "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
+ "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
+ "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
+ "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
+ "\\xfe\\xff'")
+ testrepr = repr(u''.join(map(unichr, range(256))))
+ verify(testrepr == latin1repr)
def test(method, input, output, *args):
if verbose:
verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
# UTF-8 specific encoding tests:
+verify(u''.encode('utf-8') == '')
verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
verify(unicode(u.encode(encoding),encoding) == u)
-# Roundtrip safety for non-BMP (just a few chars)
-u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
-for encoding in ('utf-8',
- 'utf-16', 'utf-16-le', 'utf-16-be',
- #'raw_unicode_escape',
- 'unicode_escape', 'unicode_internal'):
- verify(unicode(u.encode(encoding),encoding) == u)
-
+# Roundtrip safety for BMP (just the first 256 chars)
u = u''.join(map(unichr, range(256)))
for encoding in (
'latin-1',
except ValueError,why:
print '*** codec for "%s" failed: %s' % (encoding, why)
+# Roundtrip safety for BMP (just the first 128 chars)
u = u''.join(map(unichr, range(128)))
for encoding in (
'ascii',
except ValueError,why:
print '*** codec for "%s" failed: %s' % (encoding, why)
+# Roundtrip safety for non-BMP (just a few chars)
+u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
+for encoding in ('utf-8',
+ 'utf-16', 'utf-16-le', 'utf-16-be',
+ #'raw_unicode_escape',
+ 'unicode_escape', 'unicode_internal'):
+ verify(unicode(u.encode(encoding),encoding) == u)
+
+# UTF-8 must be roundtrip safe for all UCS-2 code points
+u = u''.join(map(unichr, range(0x10000)))
+for encoding in ('utf-8',):
+ verify(unicode(u.encode(encoding),encoding) == u)
+
print 'done.'
print 'Testing standard mapping codecs...',
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
- if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
+ if (ch < 0x0800) {
+ /* Note: UTF-8 encodings of surrogates are considered
+ legal UTF-8 sequences;
+
+ XXX For wide builds (UCS-4) we should probably try
+ to recombine the surrogates into a single code
+ unit.
+ */
errmsg = "illegal encoding";
goto utf8Error;
}
else
- *p++ = (Py_UNICODE)ch;
+ *p++ = (Py_UNICODE)ch;
break;
case 4:
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */
if ((ch < 0x10000) /* minimum value allowed for 4
- byte encoding */
+ byte encoding */
|| (ch > 0x10ffff)) /* maximum value allowed for
- UTF-16 */
+ UTF-16 */
{
errmsg = "illegal encoding";
goto utf8Error;
unsigned int cbWritten = 0;
int i = 0;
+ /* Short-cut for empty strings */
+ if (size == 0)
+ return PyString_FromStringAndSize(NULL, 0);
+
+ /* We allocate 4 more bytes to have room for at least one full
+ UTF-8 sequence; saves a few cycles in the loop below */
v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
if (v == NULL)
return NULL;
- if (size == 0)
- return v;
p = PyString_AS_STRING(v);
while (i < size) {