-from test import support
-import unittest
+import _testcapi
import codecs
+import io
import locale
-import sys, _testcapi, io
+import sys
+import unittest
+import warnings
+
+from test import support
+
+if sys.platform == 'win32':  # sys.getwindowsversion() is only called when the platform string is 'win32'
+ VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)  # True when the reported Windows major version is at least 6
+else:
+ VISTA_OR_LATER = False  # every other platform: the flag is always False
+
+try:
+ import ctypes  # optional dependency; its absence is handled by the except branch below
+except ImportError:
+ ctypes = None  # keeps the name bound (presumably so other code can test `ctypes is None` -- not visible here)
+ SIZEOF_WCHAR_T = -1  # sentinel value used when ctypes could not be imported
+else:
+ SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)  # size of ctypes.c_wchar as reported by ctypes.sizeof
+ def coding_checker(self, coder):  # build a checker closure: `self` supplies assertEqual, `coder` is the function under test
+ def check(input, expect):  # asserts that coder(input) returns the pair (expect, len(input))
+ self.assertEqual(coder(input), (expect, len(input)))  # second item len(input) means the whole input must be reported as consumed
+ return check  # callers invoke the result as check(input, expect)
+
class Queue(object):
"""
queue: write bytes at one end, read bytes from the other end
self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
+ self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
+ self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+
+ self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
+ self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+
+
+ class UnicodeEscapeTest(unittest.TestCase):  # exercises codecs.unicode_escape_encode and codecs.unicode_escape_decode
+ def test_empty(self):  # empty input gives empty output and a consumed count of 0, in both directions
+ self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
+ self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
+
+ def test_raw_encode(self):  # code points 32..126 other than backslash encode to the identical single byte
+ encode = codecs.unicode_escape_encode
+ for b in range(32, 127):
+ if b != b'\\'[0]:  # b'\\'[0] is the integer value of the backslash byte; backslash is excluded from this loop
+ self.assertEqual(encode(chr(b)), (bytes([b]), 1))
+
+ def test_raw_decode(self):  # any byte except backslash, followed by b'0', decodes to chr(byte) + '0'
+ decode = codecs.unicode_escape_decode
+ for b in range(256):
+ if b != b'\\'[0]:  # backslash starts an escape, so it is not a "raw" byte
+ self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))  # 2 = both input bytes consumed
+
+ def test_escape_encode(self):  # characters that the encoder is expected to write as escape sequences
+ encode = codecs.unicode_escape_encode
+ check = coding_checker(self, encode)  # check(input, expect) asserts encode(input) == (expect, len(input))
+ check('\t', br'\t')
+ check('\n', br'\n')
+ check('\r', br'\r')
+ check('\\', br'\\')  # a backslash is doubled
+ for b in range(32):  # remaining control characters below 32 ...
+ if chr(b) not in '\t\n\r':  # ... except the three that have a short escape (checked above)
+ check(chr(b), ('\\x%02x' % b).encode())  # expected form: \x plus two lowercase hex digits
+ for b in range(127, 256):  # code points 127..255 are expected in the same \xNN form
+ check(chr(b), ('\\x%02x' % b).encode())
+ check('\u20ac', br'\u20ac')  # four-hex-digit \u form
+ check('\U0001d120', br'\U0001d120')  # eight-hex-digit \U form
+
+ def test_escape_decode(self):  # escape sequences and the characters they are expected to decode to
+ decode = codecs.unicode_escape_decode
+ check = coding_checker(self, decode)  # check(input, expect) asserts decode(input) == (expect, len(input))
+ check(b"[\\\n]", "[]")  # backslash followed by a literal newline byte produces no output character
+ check(br'[\"]', '["]')
+ check(br"[\']", "[']")
+ check(br"[\\]", r"[\]")
+ check(br"[\a]", "[\x07]")
+ check(br"[\b]", "[\x08]")
+ check(br"[\t]", "[\x09]")
+ check(br"[\n]", "[\x0a]")
+ check(br"[\v]", "[\x0b]")
+ check(br"[\f]", "[\x0c]")
+ check(br"[\r]", "[\x0d]")
+ check(br"[\7]", "[\x07]")  # one-digit octal escape
+ check(br"[\8]", r"[\8]")  # 8 is not an octal digit: backslash and digit are expected unchanged
+ check(br"[\78]", "[\x078]")  # octal escape ends at the first non-octal digit
+ check(br"[\41]", "[!]")  # two-digit octal escape (0o41 == ord('!'))
+ check(br"[\418]", "[!8]")
+ check(br"[\101]", "[A]")  # three-digit octal escape (0o101 == ord('A'))
+ check(br"[\1010]", "[A0]")  # a fourth digit is not part of the octal escape
+ check(br"[\x41]", "[A]")
+ check(br"[\x410]", "[A0]")  # \x uses two hex digits; the trailing 0 is a separate character
+ check(br"\u20ac", "\u20ac")
+ check(br"\U0001d120", "\U0001d120")
+ for b in range(256):
+ if b not in b'\n"\'\\abtnvfr01234567xuUN':  # skip bytes that introduce an escape (those exercised above, plus N)
+ check(b'\\' + bytes([b]), '\\' + chr(b))  # any other byte after a backslash: both are expected unchanged
+
+ def test_decode_errors(self):  # malformed escapes: strict mode raises; "ignore" and "replace" recover
+ decode = codecs.unicode_escape_decode
+ for c, d in (b'x', 2), (b'u', 4), (b'U', 4):  # NOTE(review): other \U literals in this class use 8 hex digits, so only 0-3 digit truncations are tried for U -- confirm intent
+ for i in range(d):  # i = number of b"0" digits placed after the escape letter
+ self.assertRaises(UnicodeDecodeError, decode,  # truncated escape at the very end of the input
+ b"\\" + c + b"0"*i)
+ self.assertRaises(UnicodeDecodeError, decode,  # truncated escape followed by a non-hex byte (b"]")
+ b"[\\" + c + b"0"*i + b"]")
+ data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i  # both malformed forms in one input
+ self.assertEqual(decode(data, "ignore"), ("[]", len(data)))  # malformed escapes dropped; all input reported consumed
+ self.assertEqual(decode(data, "replace"),  # each malformed escape becomes one U+FFFD
+ ("[\ufffd]\ufffd", len(data)))
+ self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")  # well-formed escape whose value 0x110000 is expected to be rejected
+ self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))  # 10 == len(br"\U00110000")
+ self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
+
+
class SurrogateEscapeTest(unittest.TestCase):
def test_utf8(self):
const char *starts = s;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
- int j;
- Py_ssize_t outpos;
- PyUnicodeObject *v;
- Py_UNICODE *p;
+ PyObject *v;
const char *end;
char* message;
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
break;
store:
/* when we get here, chr is a 32-bit unicode character */
- if (chr <= MAX_UNICODE) {
- WRITECHAR(chr);
- if (chr <= 0xffff)
- /* UCS-2 character */
- *p++ = (Py_UNICODE) chr;
- else if (chr <= 0x10ffff) {
- /* UCS-4 character. Either store directly, or as
- surrogate pair. */
-#ifdef Py_UNICODE_WIDE
- *p++ = chr;
-#else
- chr -= 0x10000L;
- *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
- *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
-#endif
-- } else {
- endinpos = s-starts;
- if (unicode_decode_call_errorhandler(
- errors, &errorHandler,
- "unicodeescape", "illegal Unicode character",
- &starts, &end, &startinpos, &endinpos, &exc, &s,
- &v, &i))
- goto onError;
- }
- message = "illegal Unicode character";
++ message = "illegal Unicode character";
++ if (chr > MAX_UNICODE)
+ goto error;
- }
++ WRITECHAR(chr);
break;
/* \N{name} */
if (s > end) {
message = "\\ at end of string";
s--;
- endinpos = s-starts;
- if (unicode_decode_call_errorhandler(
- errors, &errorHandler,
- "unicodeescape", message,
- &starts, &end, &startinpos, &endinpos, &exc, &s,
- &v, &i))
- goto onError;
+ goto error;
}
else {
- *p++ = '\\';
- *p++ = (unsigned char)s[-1];
+ WRITECHAR('\\');
+ WRITECHAR((unsigned char)s[-1]);
}
break;
}
- nextByte:
- ;
+ continue;
+
+ error:
+ endinpos = s-starts;
- outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", message,
+ &starts, &end, &startinpos, &endinpos, &exc, &s,
- &v, &outpos, &p))
++ &v, &i))
+ goto onError;
++ len = PyUnicode_GET_LENGTH(v);
+ continue;
}
- if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
+#undef WRITECHAR
+
+ if (unicode_resize(&v, i) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);