Py_buffer info;
char lower[11]; /* Enough for any encoding shortcut */
- if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
-
/* Shortcuts for common default encodings */
- if (normalize_encoding(encoding, lower, sizeof(lower))) {
+ if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
- if (strcmp(lower, "utf-8") == 0)
- return PyUnicode_DecodeUTF8(s, size, errors);
+ if ((strcmp(lower, "utf-8") == 0) ||
+ (strcmp(lower, "utf8") == 0))
+ return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
else if ((strcmp(lower, "latin-1") == 0) ||
+ (strcmp(lower, "latin1") == 0) ||
(strcmp(lower, "iso-8859-1") == 0))
return PyUnicode_DecodeLatin1(s, size, errors);
-#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
+#ifdef HAVE_MBCS
else if (strcmp(lower, "mbcs") == 0)
return PyUnicode_DecodeMBCS(s, size, errors);
#endif
return NULL;
}
- if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
-
/* Shortcuts for common default encodings */
- if (normalize_encoding(encoding, lower, sizeof(lower))) {
+ if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
- if (strcmp(lower, "utf-8") == 0)
- return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- errors);
+ if ((strcmp(lower, "utf-8") == 0) ||
+ (strcmp(lower, "utf8") == 0))
+ {
+ if (errors == NULL || strcmp(errors, "strict") == 0)
+ return _PyUnicode_AsUTF8String(unicode, NULL);
+ else
+ return _PyUnicode_AsUTF8String(unicode, errors);
+ }
else if ((strcmp(lower, "latin-1") == 0) ||
+ (strcmp(lower, "latin1") == 0) ||
(strcmp(lower, "iso-8859-1") == 0))
- return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- errors);
-#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
+ return _PyUnicode_AsLatin1String(unicode, errors);
+#ifdef HAVE_MBCS
else if (strcmp(lower, "mbcs") == 0)
- return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- errors);
+ return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
#endif
else if (strcmp(lower, "ascii") == 0)
- return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- errors);
- }
- /* During bootstrap, we may need to find the encodings
- package, to load the file system encoding, and require the
- file system encoding in order to load the encodings
- package.
-
- Break out of this dependency by assuming that the path to
- the encodings module is ASCII-only. XXX could try wcstombs
- instead, if the file system encoding is the locale's
- encoding. */
- if (Py_FileSystemDefaultEncoding &&
- strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
- !PyThreadState_GET()->interp->codecs_initialized)
- return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- errors);
+ return _PyUnicode_AsASCIIString(unicode, errors);
+ }
/* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors);
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
#endif
- #ifdef HAVE_STAT
+/* Report the encoding of the terminal attached to the file descriptor fd.
+
+   None is returned when fd fails _PyVerify_fd() or isatty(), and also when
+   no code page (Windows) or codeset (nl_langinfo) is available.  Otherwise
+   the result comes from PyUnicode_FromFormat("cp%u", ...) on Windows or from
+   PyUnicode_FromString(nl_langinfo(CODESET)) elsewhere. */
+PyObject *
+_Py_device_encoding(int fd)
+{
+#if defined(MS_WINDOWS) || defined(MS_WIN64)
+    UINT codepage;
+#endif
+
+    if (!_PyVerify_fd(fd) || !isatty(fd))
+        Py_RETURN_NONE;
+
+#if defined(MS_WINDOWS) || defined(MS_WIN64)
+    /* fd 0 uses the console input code page, fd 1 and 2 the output one;
+       any other descriptor has no associated console code page. */
+    switch (fd) {
+    case 0:
+        codepage = GetConsoleCP();
+        break;
+    case 1:
+    case 2:
+        codepage = GetConsoleOutputCP();
+        break;
+    default:
+        codepage = 0;
+        break;
+    }
+    /* Both console functions yield 0 when the process has no console. */
+    if (codepage != 0)
+        return PyUnicode_FromFormat("cp%u", (unsigned int)codepage);
+#elif defined(CODESET)
+    {
+        char *name = nl_langinfo(CODESET);
+        if (name != NULL && name[0] != 0)
+            return PyUnicode_FromString(name);
+    }
+#endif
+    Py_RETURN_NONE;
+}
+
+ #if !defined(__APPLE__) && !defined(MS_WINDOWS)
+ extern int _Py_normalize_encoding(const char *, char *, size_t);
+
+ /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
+ On these operating systems, nl_langinfo(CODESET) announces an alias of the
+ ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
+ ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
+ locale.getpreferredencoding() codec. For example, if command line arguments
+ are decoded by mbstowcs() and encoded back by os.fsencode(), we get a
+ UnicodeEncodeError instead of retrieving the original byte string.
+
+ The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C",
+ nl_langinfo(CODESET) announces "ascii" (or an alias to ASCII), and at least
+ one byte in range 0x80-0xff can be decoded from the locale encoding. The
+ workaround is also enabled on error, for example if getting the locale
+ failed.
+
+ Values of force_ascii:
+
+ 1: the workaround is used: _Py_wchar2char() uses
+ encode_ascii_surrogateescape() and _Py_char2wchar() uses
+ decode_ascii_surrogateescape()
+ 0: the workaround is not used: _Py_wchar2char() uses wcstombs() and
+ _Py_char2wchar() uses mbstowcs()
+ -1: unknown, need to call check_force_ascii() to get the value
+ */
+ static int force_ascii = -1;
+
+ /* Decide whether the ASCII + surrogateescape workaround has to be used.
+
+    Returns 1 (force ASCII) when the LC_CTYPE locale is "C", the codeset
+    reported by nl_langinfo(CODESET) normalizes to an ASCII alias, and
+    mbstowcs() nevertheless decodes at least one byte in 0x80-0xff.
+    Returns 1 as well when the locale or codeset cannot be obtained or
+    normalized, and when nl_langinfo(CODESET) is not available at all.
+    Returns 0 in every other case. */
+ static int
+ check_force_ascii(void)
+ {
+     char *loc;
+ #if defined(HAVE_LANGINFO_H) && defined(CODESET)
+     char *codeset;
+     char encoding[100];
+     int is_ascii;
+     unsigned int i;
+     const char * const *alias;
+     /* Normalized names that all designate the ASCII encoding. */
+     static const char * const ascii_aliases[] = {
+         "ascii",
+         "646",
+         "ansi-x3.4-1968",
+         "ansi-x3-4-1968",
+         "ansi-x3.4-1986",
+         "cp367",
+         "csascii",
+         "ibm367",
+         "iso646-us",
+         "iso-646.irv-1991",
+         "iso-ir-6",
+         "us",
+         "us-ascii",
+         NULL
+     };
+ #endif
+
+     loc = setlocale(LC_CTYPE, NULL);
+     if (loc == NULL)
+         goto error;
+     if (strcmp(loc, "C") != 0) {
+         /* the LC_CTYPE locale is different than C */
+         return 0;
+     }
+
+ #if defined(HAVE_LANGINFO_H) && defined(CODESET)
+     codeset = nl_langinfo(CODESET);
+     if (!codeset || codeset[0] == '\0') {
+         /* CODESET is not set or empty */
+         goto error;
+     }
+     if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
+         goto error;
+
+     is_ascii = 0;
+     for (alias = ascii_aliases; *alias != NULL; alias++) {
+         if (strcmp(encoding, *alias) == 0) {
+             is_ascii = 1;
+             break;
+         }
+     }
+     if (!is_ascii) {
+         /* nl_langinfo(CODESET) is not "ascii" or an alias of ASCII */
+         return 0;
+     }
+
+     /* Probe every non-ASCII byte, 0xff included. */
+     for (i = 0x80; i <= 0xff; i++) {
+         /* NUL-terminated so that mbstowcs() can never read past the
+            buffer, even if the byte is taken as a multibyte lead byte. */
+         unsigned char ch[2];
+         wchar_t wch;
+         size_t res;
+
+         ch[0] = (unsigned char)i;
+         ch[1] = '\0';
+         res = mbstowcs(&wch, (const char *)ch, 1);
+         if (res != (size_t)-1) {
+             /* decoding a non-ASCII character from the locale encoding
+                succeeded: the locale encoding is not ASCII, force ASCII */
+             return 1;
+         }
+     }
+     /* None of the bytes in the range 0x80-0xff can be decoded from the locale
+        encoding: the locale encoding is really ASCII */
+     return 0;
+ #else
+     /* nl_langinfo(CODESET) is not available: always force ASCII */
+     return 1;
+ #endif
+
+ error:
+     /* if an error occurred, force the ASCII encoding */
+     return 1;
+ }
+
+ /* Encode the wide string text to bytes as ASCII with surrogateescape.
+
+    Characters up to 0x7f are copied as single bytes, and surrogates
+    U+DC80..U+DCFF are turned back into the bytes 0x80..0xff.  The result is
+    a NUL-terminated buffer obtained from PyMem_Malloc().
+
+    If error_pos is not NULL it is first set to (size_t)-1.  When a
+    character cannot be encoded, the buffer is released, *error_pos (if
+    given) receives the index of that character, and NULL is returned.
+    NULL is also returned when the allocation fails. */
+ static char*
+ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
+ {
+     size_t length, pos;
+     char *buffer, *dst;
+
+     if (error_pos != NULL)
+         *error_pos = (size_t)-1;
+
+     length = wcslen(text);
+
+     /* one extra byte for the terminating NUL */
+     buffer = PyMem_Malloc(length + 1);
+     if (buffer == NULL)
+         return NULL;
+
+     dst = buffer;
+     for (pos = 0; pos < length; pos++) {
+         const wchar_t wc = text[pos];
+
+         if (wc > 0x7f && !(0xdc80 <= wc && wc <= 0xdcff)) {
+             /* neither ASCII nor an escaped byte: report where it failed */
+             if (error_pos != NULL)
+                 *error_pos = pos;
+             PyMem_Free(buffer);
+             return NULL;
+         }
+
+         if (wc <= 0x7f)
+             *dst++ = (char)wc;              /* plain ASCII */
+         else
+             *dst++ = (char)(wc - 0xdc00);   /* UTF-8b surrogate */
+     }
+     *dst = '\0';
+     return buffer;
+ }
+ #endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
+
+ #if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
+ /* Decode the NUL-terminated byte string arg as ASCII with surrogateescape.
+
+    Bytes below 128 are copied unchanged; every other byte b is stored as
+    the wide character 0xdc00 + b.  The result is a NUL-terminated wide
+    string obtained from PyMem_Malloc().  If size is not NULL, it receives
+    the number of wide characters written, excluding the terminator.
+
+    Returns NULL, leaving *size untouched, when the buffer cannot be
+    allocated or when its size in bytes would not fit in a size_t. */
+ static wchar_t*
+ decode_ascii_surrogateescape(const char *arg, size_t *size)
+ {
+     const unsigned char *in = (const unsigned char *)arg;
+     size_t len = strlen(arg);
+     wchar_t *res;
+     wchar_t *out;
+
+     /* len + 1 wide characters are required: reject lengths for which
+        (len + 1) * sizeof(wchar_t) would wrap around. */
+     if (len >= (size_t)-1 / sizeof(wchar_t))
+         return NULL;
+
+     res = PyMem_Malloc((len + 1) * sizeof(wchar_t));
+     if (!res)
+         return NULL;
+
+     out = res;
+     while (*in) {
+         if (*in < 128) {
+             *out++ = *in++;
+         }
+         else {
+             /* escape the undecodable byte as a lone surrogate */
+             *out++ = 0xdc00 + *in++;
+         }
+     }
+     *out = 0;
+     if (size != NULL)
+         *size = (size_t)(out - res);
+     return res;
+ }
+ #endif
+
/* Decode a byte string from the locale encoding with the
surrogateescape error handler (undecodable bytes are decoded as characters
/* Cannot use C locale for escaping; manually escape as if charset
is ASCII (i.e. escape all bytes > 128. This will still roundtrip
correctly in the locale's charset, which must be an ASCII superset. */
- res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
- if (!res)
+ res = decode_ascii_surrogateescape(arg, size);
+ if (res == NULL)
goto oom;
- in = (unsigned char*)arg;
- out = res;
- while(*in)
- if(*in < 128)
- *out++ = *in++;
- else
- *out++ = 0xdc00 + *in++;
- *out = 0;
#endif /* HAVE_MBRTOWC */
- if (size != NULL)
- *size = out - res;
return res;
oom:
- fprintf(stderr, "out of memory\n");
+ if (size != NULL)
+ *size = (size_t)-1;
return NULL;
#endif /* __APPLE__ */
}