From: Victor Stinner Date: Thu, 2 May 2019 18:56:30 +0000 (-0400) Subject: bpo-36775: _PyCoreConfig only uses wchar_t* (GH-13062) X-Git-Tag: v3.8.0a4~26 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=709d23dee69e700b87d5a4cb59e149d0e1af7993;p=python bpo-36775: _PyCoreConfig only uses wchar_t* (GH-13062) _PyCoreConfig: Change filesystem_encoding, filesystem_errors, stdio_encoding and stdio_errors fields type from char* to wchar_t*. Changes: * PyInterpreterState: replace fscodec_initialized (int) with fs_codec structure. * Add get_error_handler_wide() and unicode_encode_utf8() helper functions. * Add error_handler parameter to unicode_encode_locale() and unicode_decode_locale(). * Remove _PyCoreConfig_SetString(). * Rename _PyCoreConfig_SetWideString() to _PyCoreConfig_SetString(). * Rename _PyCoreConfig_SetWideStringFromString() to _PyCoreConfig_DecodeLocale(). --- diff --git a/Include/cpython/coreconfig.h b/Include/cpython/coreconfig.h index 1aab5e4f0e..5672080b78 100644 --- a/Include/cpython/coreconfig.h +++ b/Include/cpython/coreconfig.h @@ -207,8 +207,8 @@ typedef struct { See Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors. */ - char *filesystem_encoding; - char *filesystem_errors; + wchar_t *filesystem_encoding; + wchar_t *filesystem_errors; wchar_t *pycache_prefix; /* PYTHONPYCACHEPREFIX, -X pycache_prefix=PATH */ wchar_t *program_name; /* Program name, see also Py_GetProgramName() */ @@ -334,13 +334,13 @@ typedef struct { Value set from PYTHONIOENCODING environment variable and Py_SetStandardStreamEncoding() function. See also 'stdio_errors' attribute. */ - char *stdio_encoding; + wchar_t *stdio_encoding; /* Error handler of sys.stdin and sys.stdout. Value set from PYTHONIOENCODING environment variable and Py_SetStandardStreamEncoding() function. See also 'stdio_encoding' attribute. 
*/ - char *stdio_errors; + wchar_t *stdio_errors; #ifdef MS_WINDOWS /* If greater than zero, use io.FileIO instead of WindowsConsoleIO for sys diff --git a/Include/internal/pycore_coreconfig.h b/Include/internal/pycore_coreconfig.h index 8af310d2b0..d48904e482 100644 --- a/Include/internal/pycore_coreconfig.h +++ b/Include/internal/pycore_coreconfig.h @@ -106,12 +106,9 @@ PyAPI_FUNC(_PyInitError) _PyCoreConfig_Copy( _PyCoreConfig *config, const _PyCoreConfig *config2); PyAPI_FUNC(_PyInitError) _PyCoreConfig_SetString( - char **config_str, - const char *str); -PyAPI_FUNC(_PyInitError) _PyCoreConfig_SetWideString( wchar_t **config_str, const wchar_t *str); -PyAPI_FUNC(_PyInitError) _PyCoreConfig_SetWideStringFromString( +PyAPI_FUNC(_PyInitError) _PyCoreConfig_DecodeLocale( wchar_t **config_str, const char *str); PyAPI_FUNC(_PyInitError) _PyCoreConfig_InitPathConfig(_PyCoreConfig *config); diff --git a/Include/internal/pycore_pylifecycle.h b/Include/internal/pycore_pylifecycle.h index a2383d476e..321cc5d278 100644 --- a/Include/internal/pycore_pylifecycle.h +++ b/Include/internal/pycore_pylifecycle.h @@ -21,6 +21,9 @@ extern int _Py_SetFileSystemEncoding( const char *errors); extern void _Py_ClearFileSystemEncoding(void); extern _PyInitError _PyUnicode_InitEncodings(PyInterpreterState *interp); +#ifdef MS_WINDOWS +extern int _PyUnicode_EnableLegacyWindowsFSEncoding(void); +#endif PyAPI_FUNC(void) _Py_ClearStandardStreamEncoding(void); diff --git a/Include/internal/pycore_pystate.h b/Include/internal/pycore_pystate.h index 2c24f679dc..67bcd147e2 100644 --- a/Include/internal/pycore_pystate.h +++ b/Include/internal/pycore_pystate.h @@ -56,7 +56,14 @@ struct _is { PyObject *codec_search_cache; PyObject *codec_error_registry; int codecs_initialized; - int fscodec_initialized; + + /* fs_codec.encoding is initialized to NULL. + Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). 
*/ + struct { + char *encoding; /* Filesystem encoding (encoded to UTF-8) */ + char *errors; /* Filesystem errors (encoded to UTF-8) */ + _Py_error_handler error_handler; + } fs_codec; _PyCoreConfig core_config; #ifdef HAVE_DLOPEN diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 0abb4c8abb..8645bc26cf 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -260,6 +260,7 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(utf8_encoder)(PyObject *unicode, STRINGLIB_CHAR *data, Py_ssize_t size, + _Py_error_handler error_handler, const char *errors) { Py_ssize_t i; /* index into data of next input character */ @@ -268,7 +269,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, PyObject *error_handler_obj = NULL; PyObject *exc = NULL; PyObject *rep = NULL; - _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; #endif #if STRINGLIB_SIZEOF_CHAR == 1 const Py_ssize_t max_char_size = 2; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5b6b241cb6..4d86519e86 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -40,6 +40,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #define PY_SSIZE_T_CLEAN #include "Python.h" +#include "pycore_coreconfig.h" #include "pycore_fileutils.h" #include "pycore_object.h" #include "pycore_pylifecycle.h" @@ -264,6 +265,13 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, /* Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); +static PyObject * +unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, + const char *errors); +static PyObject * +unicode_decode_utf8(const char *s, Py_ssize_t size, + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed); /* List of static strings. 
*/ static _Py_Identifier *static_strings = NULL; @@ -388,6 +396,35 @@ _Py_GetErrorHandler(const char *errors) return _Py_ERROR_OTHER; } + +static _Py_error_handler +get_error_handler_wide(const wchar_t *errors) +{ + if (errors == NULL || wcscmp(errors, L"strict") == 0) { + return _Py_ERROR_STRICT; + } + if (wcscmp(errors, L"surrogateescape") == 0) { + return _Py_ERROR_SURROGATEESCAPE; + } + if (wcscmp(errors, L"replace") == 0) { + return _Py_ERROR_REPLACE; + } + if (wcscmp(errors, L"ignore") == 0) { + return _Py_ERROR_IGNORE; + } + if (wcscmp(errors, L"backslashreplace") == 0) { + return _Py_ERROR_BACKSLASHREPLACE; + } + if (wcscmp(errors, L"surrogatepass") == 0) { + return _Py_ERROR_SURROGATEPASS; + } + if (wcscmp(errors, L"xmlcharrefreplace") == 0) { + return _Py_ERROR_XMLCHARREFREPLACE; + } + return _Py_ERROR_OTHER; +} + + /* The max unicode value is always 0x10FFFF while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE @@ -3445,11 +3482,9 @@ PyUnicode_AsEncodedObject(PyObject *unicode, static PyObject * -unicode_encode_locale(PyObject *unicode, const char *errors, +unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler, int current_locale) { - _Py_error_handler error_handler = _Py_GetErrorHandler(errors); - Py_ssize_t wlen; wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen); if (wstr == NULL) { @@ -3499,30 +3534,44 @@ unicode_encode_locale(PyObject *unicode, const char *errors, PyObject * PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) { - return unicode_encode_locale(unicode, errors, 1); + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); + return unicode_encode_locale(unicode, error_handler, 1); } PyObject * PyUnicode_EncodeFSDefault(PyObject *unicode) { PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); - const _PyCoreConfig *config = &interp->core_config; #ifdef _Py_FORCE_UTF8_FS_ENCODING - return _PyUnicode_AsUTF8String(unicode, 
config->filesystem_errors); + if (interp->fs_codec.encoding) { + return unicode_encode_utf8(unicode, + interp->fs_codec.error_handler, + interp->fs_codec.errors); + } + else { + const _PyCoreConfig *config = &interp->core_config; + _Py_error_handler errors; + errors = get_error_handler_wide(config->filesystem_errors); + assert(errors != _Py_ERROR_UNKNOWN); + return unicode_encode_utf8(unicode, errors, NULL); + } #else /* Bootstrap check: if the filesystem codec is implemented in Python, we cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C implementation of the locale codec until the codec registry is initialized and the Python codec is loaded. See initfsencoding(). */ - if (interp->fscodec_initialized) { + if (interp->fs_codec.encoding) { return PyUnicode_AsEncodedString(unicode, - config->filesystem_encoding, - config->filesystem_errors); + interp->fs_codec.encoding, + interp->fs_codec.errors); } else { - return unicode_encode_locale(unicode, - config->filesystem_errors, 0); + const _PyCoreConfig *config = &interp->core_config; + _Py_error_handler errors; + errors = get_error_handler_wide(config->filesystem_errors); + assert(errors != _Py_ERROR_UNKNOWN); + return unicode_encode_locale(unicode, errors, 0); } #endif } @@ -3663,11 +3712,9 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode, } static PyObject* -unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, - int current_locale) +unicode_decode_locale(const char *str, Py_ssize_t len, + _Py_error_handler errors, int current_locale) { - _Py_error_handler error_handler = _Py_GetErrorHandler(errors); - if (str[len] != '\0' || (size_t)len != strlen(str)) { PyErr_SetString(PyExc_ValueError, "embedded null byte"); return NULL; @@ -3677,7 +3724,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, size_t wlen; const char *reason; int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, 
&reason, - current_locale, error_handler); + current_locale, errors); if (res != 0) { if (res == -2) { PyObject *exc; @@ -3709,14 +3756,16 @@ PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, const char *errors) { - return unicode_decode_locale(str, len, errors, 1); + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); + return unicode_decode_locale(str, len, error_handler, 1); } PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) { Py_ssize_t size = (Py_ssize_t)strlen(str); - return unicode_decode_locale(str, size, errors, 1); + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); + return unicode_decode_locale(str, size, error_handler, 1); } @@ -3730,23 +3779,36 @@ PyObject* PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) { PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); - const _PyCoreConfig *config = &interp->core_config; #ifdef _Py_FORCE_UTF8_FS_ENCODING - return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL); + if (interp->fs_codec.encoding) { + return unicode_decode_utf8(s, size, + interp->fs_codec.error_handler, + interp->fs_codec.errors, + NULL); + } + else { + const _PyCoreConfig *config = &interp->core_config; + _Py_error_handler errors; + errors = get_error_handler_wide(config->filesystem_errors); + assert(errors != _Py_ERROR_UNKNOWN); + return unicode_decode_utf8(s, size, errors, NULL, NULL); + } #else /* Bootstrap check: if the filesystem codec is implemented in Python, we cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C implementation of the locale codec until the codec registry is initialized and the Python codec is loaded. See initfsencoding(). 
*/ - if (interp->fscodec_initialized) { + if (interp->fs_codec.encoding) { return PyUnicode_Decode(s, size, - config->filesystem_encoding, - config->filesystem_errors); + interp->fs_codec.encoding, + interp->fs_codec.errors); } else { - return unicode_decode_locale(s, size, - config->filesystem_errors, 0); + const _PyCoreConfig *config = &interp->core_config; + _Py_error_handler errors; + errors = get_error_handler_wide(config->filesystem_errors); + return unicode_decode_locale(s, size, errors, 0); } #endif } @@ -4810,11 +4872,10 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest) return p - start; } -PyObject * -PyUnicode_DecodeUTF8Stateful(const char *s, - Py_ssize_t size, - const char *errors, - Py_ssize_t *consumed) +static PyObject * +unicode_decode_utf8(const char *s, Py_ssize_t size, + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed) { _PyUnicodeWriter writer; const char *starts = s; @@ -4825,7 +4886,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s, const char *errmsg = ""; PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4948,6 +5008,16 @@ onError: } +PyObject * +PyUnicode_DecodeUTF8Stateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed); +} + + /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is non-zero, use strict error handler otherwise. @@ -5231,8 +5301,9 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, maximum possible needed (4 result bytes per Unicode character), and return the excess memory at the end. 
*/ -PyObject * -_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) +static PyObject * +unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, + const char *errors) { enum PyUnicode_Kind kind; void *data; @@ -5260,14 +5331,21 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) case PyUnicode_1BYTE_KIND: /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ assert(!PyUnicode_IS_ASCII(unicode)); - return ucs1lib_utf8_encoder(unicode, data, size, errors); + return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors); case PyUnicode_2BYTE_KIND: - return ucs2lib_utf8_encoder(unicode, data, size, errors); + return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors); case PyUnicode_4BYTE_KIND: - return ucs4lib_utf8_encoder(unicode, data, size, errors); + return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors); } } +PyObject * +_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) +{ + return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors); +} + + PyObject * PyUnicode_EncodeUTF8(const Py_UNICODE *s, Py_ssize_t size, @@ -15575,12 +15653,35 @@ PyUnicode_AsUnicodeCopy(PyObject *unicode) } -static char* -get_codec_name(const char *encoding) +static int +encode_wstr_utf8(wchar_t *wstr, char **str, const char *name) { - PyObject *codec, *name_obj = NULL; + int res; + res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT); + if (res == -2) { + PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name); + return -1; + } + if (res < 0) { + PyErr_NoMemory(); + return -1; + } + return 0; +} + + +static int +config_get_codec_name(wchar_t **config_encoding) +{ + char *encoding; + if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) { + return -1; + } + + PyObject *name_obj = NULL; + PyObject *codec = _PyCodec_Lookup(encoding); + PyMem_RawFree(encoding); - codec = _PyCodec_Lookup(encoding); if (!codec) goto error; @@ -15590,71 +15691,107 @@ 
get_codec_name(const char *encoding) goto error; } - const char *name_utf8 = PyUnicode_AsUTF8(name_obj); - if (name_utf8 == NULL) { + wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL); + Py_DECREF(name_obj); + if (wname == NULL) { goto error; } - char *name = _PyMem_RawStrdup(name_utf8); - Py_DECREF(name_obj); - if (name == NULL) { + wchar_t *raw_wname = _PyMem_RawWcsdup(wname); + if (raw_wname == NULL) { + PyMem_Free(wname); PyErr_NoMemory(); - return NULL; + goto error; } - return name; + + PyMem_RawFree(*config_encoding); + *config_encoding = raw_wname; + + PyMem_Free(wname); + return 0; error: Py_XDECREF(codec); Py_XDECREF(name_obj); - return NULL; + return -1; } static _PyInitError init_stdio_encoding(PyInterpreterState *interp) { + /* Update the stdio encoding to the normalized Python codec name. */ _PyCoreConfig *config = &interp->core_config; - - char *codec_name = get_codec_name(config->stdio_encoding); - if (codec_name == NULL) { + if (config_get_codec_name(&config->stdio_encoding) < 0) { return _Py_INIT_ERR("failed to get the Python codec name " "of the stdio encoding"); } - PyMem_RawFree(config->stdio_encoding); - config->stdio_encoding = codec_name; return _Py_INIT_OK(); } -static _PyInitError -init_fs_encoding(PyInterpreterState *interp) +static int +init_fs_codec(PyInterpreterState *interp) { _PyCoreConfig *config = &interp->core_config; - char *encoding = get_codec_name(config->filesystem_encoding); - if (encoding == NULL) { - /* Such error can only occurs in critical situations: no more - memory, import a module of the standard library failed, etc. 
*/ - return _Py_INIT_ERR("failed to get the Python codec " - "of the filesystem encoding"); + _Py_error_handler error_handler; + error_handler = get_error_handler_wide(config->filesystem_errors); + if (error_handler == _Py_ERROR_UNKNOWN) { + PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler"); + return -1; } - /* Update the filesystem encoding to the normalized Python codec name. - For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii" - (Python codec name). */ - PyMem_RawFree(config->filesystem_encoding); - config->filesystem_encoding = encoding; + char *encoding, *errors; + if (encode_wstr_utf8(config->filesystem_encoding, + &encoding, + "filesystem_encoding") < 0) { + return -1; + } + + if (encode_wstr_utf8(config->filesystem_errors, + &errors, + "filesystem_errors") < 0) { + PyMem_RawFree(encoding); + return -1; + } + + PyMem_RawFree(interp->fs_codec.encoding); + interp->fs_codec.encoding = encoding; + PyMem_RawFree(interp->fs_codec.errors); + interp->fs_codec.errors = errors; + interp->fs_codec.error_handler = error_handler; + + /* At this point, PyUnicode_EncodeFSDefault() and + PyUnicode_DecodeFSDefault() can now use the Python codec rather than + the C implementation of the filesystem encoding. */ /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors global configuration variables. */ - if (_Py_SetFileSystemEncoding(config->filesystem_encoding, - config->filesystem_errors) < 0) { - return _Py_INIT_NO_MEMORY(); + if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding, + interp->fs_codec.errors) < 0) { + PyErr_NoMemory(); + return -1; + } + return 0; +} + + +static _PyInitError +init_fs_encoding(PyInterpreterState *interp) +{ + /* Update the filesystem encoding to the normalized Python codec name. + For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii" + (Python codec name). 
*/ + _PyCoreConfig *config = &interp->core_config; + if (config_get_codec_name(&config->filesystem_encoding) < 0) { + return _Py_INIT_ERR("failed to get the Python codec " + "of the filesystem encoding"); } - /* PyUnicode can now use the Python codec rather than C implementation - for the filesystem encoding */ - interp->fscodec_initialized = 1; + if (init_fs_codec(interp) < 0) { + return _Py_INIT_ERR("cannot initialize filesystem codec"); + } return _Py_INIT_OK(); } @@ -15671,6 +15808,33 @@ _PyUnicode_InitEncodings(PyInterpreterState *interp) } +#ifdef MS_WINDOWS +int +_PyUnicode_EnableLegacyWindowsFSEncoding(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); + _PyCoreConfig *config = &interp->core_config; + + /* Set the filesystem encoding to mbcs/replace (PEP 529) */ + wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs"); + wchar_t *errors = _PyMem_RawWcsdup(L"replace"); + if (encoding == NULL || errors == NULL) { + PyMem_RawFree(encoding); + PyMem_RawFree(errors); + PyErr_NoMemory(); + return -1; + } + + PyMem_RawFree(config->filesystem_encoding); + config->filesystem_encoding = encoding; + PyMem_RawFree(config->filesystem_errors); + config->filesystem_errors = errors; + + return init_fs_codec(interp); +} +#endif + + void _PyUnicode_Fini(void) { @@ -15694,6 +15858,12 @@ _PyUnicode_Fini(void) } _PyUnicode_ClearStaticStrings(); (void)PyUnicode_ClearFreeList(); + + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); + PyMem_RawFree(interp->fs_codec.encoding); + interp->fs_codec.encoding = NULL; + PyMem_RawFree(interp->fs_codec.errors); + interp->fs_codec.errors = NULL; } diff --git a/Programs/_testembed.c b/Programs/_testembed.c index 6e764e3b6c..2cadf82cb1 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -488,8 +488,8 @@ static int test_init_from_config(void) Force it to 0 through the config. 
*/ config.legacy_windows_stdio = 0; #endif - config.stdio_encoding = "iso8859-1"; - config.stdio_errors = "replace"; + config.stdio_encoding = L"iso8859-1"; + config.stdio_errors = L"replace"; putenv("PYTHONNOUSERSITE="); Py_NoUserSiteDirectory = 0; diff --git a/Python/coreconfig.c b/Python/coreconfig.c index c40c1f859e..15643be376 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -523,27 +523,7 @@ _PyCoreConfig_Clear(_PyCoreConfig *config) /* Copy str into *config_str (duplicate the string) */ _PyInitError -_PyCoreConfig_SetString(char **config_str, const char *str) -{ - char *str2; - if (str != NULL) { - str2 = _PyMem_RawStrdup(str); - if (str2 == NULL) { - return _Py_INIT_NO_MEMORY(); - } - } - else { - str2 = NULL; - } - PyMem_RawFree(*config_str); - *config_str = str2; - return _Py_INIT_OK(); -} - - -/* Copy str into *config_str (duplicate the string) */ -_PyInitError -_PyCoreConfig_SetWideString(wchar_t **config_str, const wchar_t *str) +_PyCoreConfig_SetString(wchar_t **config_str, const wchar_t *str) { wchar_t *str2; if (str != NULL) { @@ -563,8 +543,8 @@ _PyCoreConfig_SetWideString(wchar_t **config_str, const wchar_t *str) /* Decode str using Py_DecodeLocale() and set the result into *config_str */ static _PyInitError -_PyCoreConfig_SetWideStringFromStringErr(wchar_t **config_str, const char *str, - const char *decode_err_msg) +_PyCoreConfig_DecodeLocaleErr(wchar_t **config_str, const char *str, + const char *decode_err_msg) { wchar_t *str2; if (str != NULL) { @@ -588,19 +568,17 @@ _PyCoreConfig_SetWideStringFromStringErr(wchar_t **config_str, const char *str, } +#define CONFIG_DECODE_LOCALE(config_str, str, NAME) \ + _PyCoreConfig_DecodeLocaleErr(config_str, str, "cannot decode " NAME) + + _PyInitError -_PyCoreConfig_SetWideStringFromString(wchar_t **config_str, const char *str) +_PyCoreConfig_DecodeLocale(wchar_t **config_str, const char *str) { - return _PyCoreConfig_SetWideStringFromStringErr( - config_str, str, "cannot decode string"); + 
return CONFIG_DECODE_LOCALE(config_str, str, "string"); } -#define CONFIG_DECODE_LOCALE(config_str, str, NAME) \ - _PyCoreConfig_SetWideStringFromStringErr(config_str, str, \ - "cannot decode " NAME) - - _PyInitError _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) { @@ -608,16 +586,9 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) _PyCoreConfig_Clear(config); #define COPY_ATTR(ATTR) config->ATTR = config2->ATTR -#define COPY_STR_ATTR(ATTR) \ - do { \ - err = _PyCoreConfig_SetString(&config->ATTR, config2->ATTR); \ - if (_Py_INIT_FAILED(err)) { \ - return err; \ - } \ - } while (0) #define COPY_WSTR_ATTR(ATTR) \ do { \ - err = _PyCoreConfig_SetWideString(&config->ATTR, config2->ATTR); \ + err = _PyCoreConfig_SetString(&config->ATTR, config2->ATTR); \ if (_Py_INIT_FAILED(err)) { \ return err; \ } \ @@ -676,10 +647,10 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) COPY_ATTR(quiet); COPY_ATTR(user_site_directory); COPY_ATTR(buffered_stdio); - COPY_STR_ATTR(filesystem_encoding); - COPY_STR_ATTR(filesystem_errors); - COPY_STR_ATTR(stdio_encoding); - COPY_STR_ATTR(stdio_errors); + COPY_WSTR_ATTR(filesystem_encoding); + COPY_WSTR_ATTR(filesystem_errors); + COPY_WSTR_ATTR(stdio_encoding); + COPY_WSTR_ATTR(stdio_errors); #ifdef MS_WINDOWS COPY_ATTR(legacy_windows_stdio); #endif @@ -692,7 +663,6 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) COPY_ATTR(_init_main); #undef COPY_ATTR -#undef COPY_STR_ATTR #undef COPY_WSTR_ATTR #undef COPY_WSTRLIST return _Py_INIT_OK(); @@ -721,16 +691,10 @@ _PyCoreConfig_AsDict(const _PyCoreConfig *config) goto fail; \ } \ } while (0) -#define FROM_STRING(STR) \ - ((STR != NULL) ? 
\ - PyUnicode_FromString(STR) \ - : (Py_INCREF(Py_None), Py_None)) #define SET_ITEM_INT(ATTR) \ SET_ITEM(#ATTR, PyLong_FromLong(config->ATTR)) #define SET_ITEM_UINT(ATTR) \ SET_ITEM(#ATTR, PyLong_FromUnsignedLong(config->ATTR)) -#define SET_ITEM_STR(ATTR) \ - SET_ITEM(#ATTR, FROM_STRING(config->ATTR)) #define FROM_WSTRING(STR) \ ((STR != NULL) ? \ PyUnicode_FromWideChar(STR, -1) \ @@ -753,8 +717,8 @@ _PyCoreConfig_AsDict(const _PyCoreConfig *config) SET_ITEM_INT(show_alloc_count); SET_ITEM_INT(dump_refs); SET_ITEM_INT(malloc_stats); - SET_ITEM_STR(filesystem_encoding); - SET_ITEM_STR(filesystem_errors); + SET_ITEM_WSTR(filesystem_encoding); + SET_ITEM_WSTR(filesystem_errors); SET_ITEM_WSTR(pycache_prefix); SET_ITEM_WSTR(program_name); SET_ITEM_WSTRLIST(argv); @@ -783,8 +747,8 @@ _PyCoreConfig_AsDict(const _PyCoreConfig *config) SET_ITEM_INT(quiet); SET_ITEM_INT(user_site_directory); SET_ITEM_INT(buffered_stdio); - SET_ITEM_STR(stdio_encoding); - SET_ITEM_STR(stdio_errors); + SET_ITEM_WSTR(stdio_encoding); + SET_ITEM_WSTR(stdio_errors); #ifdef MS_WINDOWS SET_ITEM_INT(legacy_windows_stdio); #endif @@ -803,12 +767,10 @@ fail: Py_DECREF(dict); return NULL; -#undef FROM_STRING #undef FROM_WSTRING #undef SET_ITEM #undef SET_ITEM_INT #undef SET_ITEM_UINT -#undef SET_ITEM_STR #undef SET_ITEM_WSTR #undef SET_ITEM_WSTRLIST } @@ -845,7 +807,7 @@ _PyCoreConfig_GetEnvDup(const _PyCoreConfig *config, return _Py_INIT_OK(); } - return _PyCoreConfig_SetWideString(dest, var); + return _PyCoreConfig_SetString(dest, var); #else const char *var = getenv(name); if (!var || var[0] == '\0') { @@ -853,7 +815,7 @@ _PyCoreConfig_GetEnvDup(const _PyCoreConfig *config, return _Py_INIT_OK(); } - return _PyCoreConfig_SetWideStringFromStringErr(dest, var, decode_err_msg); + return _PyCoreConfig_DecodeLocaleErr(dest, var, decode_err_msg); #endif } @@ -996,8 +958,7 @@ config_init_program_name(_PyCoreConfig *config) /* Use argv[0] by default, if available */ if (config->program != NULL) { - err = 
_PyCoreConfig_SetWideString(&config->program_name, - config->program); + err = _PyCoreConfig_SetString(&config->program_name, config->program); if (_Py_INIT_FAILED(err)) { return err; } @@ -1010,7 +971,7 @@ config_init_program_name(_PyCoreConfig *config) #else const wchar_t *default_program_name = L"python3"; #endif - err = _PyCoreConfig_SetWideString(&config->program_name, default_program_name); + err = _PyCoreConfig_SetString(&config->program_name, default_program_name); if (_Py_INIT_FAILED(err)) { return err; } @@ -1025,8 +986,8 @@ config_init_executable(_PyCoreConfig *config) /* If Py_SetProgramFullPath() was called, use its value */ const wchar_t *program_full_path = _Py_path_config.program_full_path; if (program_full_path != NULL) { - _PyInitError err = _PyCoreConfig_SetWideString(&config->executable, - program_full_path); + _PyInitError err = _PyCoreConfig_SetString(&config->executable, + program_full_path); if (_Py_INIT_FAILED(err)) { return err; } @@ -1051,7 +1012,7 @@ config_init_home(_PyCoreConfig *config) /* If Py_SetPythonHome() was called, use its value */ wchar_t *home = _Py_path_config.home; if (home) { - _PyInitError err = _PyCoreConfig_SetWideString(&config->home, home); + _PyInitError err = _PyCoreConfig_SetString(&config->home, home); if (_Py_INIT_FAILED(err)) { return err; } @@ -1280,7 +1241,7 @@ config_read_complex_options(_PyCoreConfig *config) } -static const char * +static const wchar_t * config_get_stdio_errors(const _PyCoreConfig *config) { #ifndef MS_WINDOWS @@ -1288,43 +1249,44 @@ config_get_stdio_errors(const _PyCoreConfig *config) if (loc != NULL) { /* surrogateescape is the default in the legacy C and POSIX locales */ if (strcmp(loc, "C") == 0 || strcmp(loc, "POSIX") == 0) { - return "surrogateescape"; + return L"surrogateescape"; } #ifdef PY_COERCE_C_LOCALE /* surrogateescape is the default in locale coercion target locales */ if (_Py_IsLocaleCoercionTarget(loc)) { - return "surrogateescape"; + return L"surrogateescape"; } #endif } 
- return "strict"; + return L"strict"; #else /* On Windows, always use surrogateescape by default */ - return "surrogateescape"; + return L"surrogateescape"; #endif } static _PyInitError -config_get_locale_encoding(char **locale_encoding) +config_get_locale_encoding(wchar_t **locale_encoding) { #ifdef MS_WINDOWS char encoding[20]; PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP()); + return _PyCoreConfig_DecodeLocale(locale_encoding, encoding); #elif defined(_Py_FORCE_UTF8_LOCALE) - const char *encoding = "UTF-8"; + return _PyCoreConfig_SetString(locale_encoding, L"utf-8"); #else const char *encoding = nl_langinfo(CODESET); if (!encoding || encoding[0] == '\0') { return _Py_INIT_ERR("failed to get the locale encoding: " "nl_langinfo(CODESET) failed"); } + /* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */ + return CONFIG_DECODE_LOCALE(locale_encoding, encoding, + "nl_langinfo(CODESET)"); #endif - - assert(*locale_encoding == NULL); - return _PyCoreConfig_SetString(locale_encoding, encoding); } @@ -1337,16 +1299,18 @@ config_init_stdio_encoding(_PyCoreConfig *config, /* If Py_SetStandardStreamEncoding() have been called, use these parameters. 
*/ if (config->stdio_encoding == NULL && _Py_StandardStreamEncoding != NULL) { - err = _PyCoreConfig_SetString(&config->stdio_encoding, - _Py_StandardStreamEncoding); + err = CONFIG_DECODE_LOCALE(&config->stdio_encoding, + _Py_StandardStreamEncoding, + "_Py_StandardStreamEncoding"); if (_Py_INIT_FAILED(err)) { return err; } } if (config->stdio_errors == NULL && _Py_StandardStreamErrors != NULL) { - err = _PyCoreConfig_SetString(&config->stdio_errors, - _Py_StandardStreamErrors); + err = CONFIG_DECODE_LOCALE(&config->stdio_errors, + _Py_StandardStreamErrors, + "_Py_StandardStreamErrors"); if (_Py_INIT_FAILED(err)) { return err; } @@ -1359,11 +1323,9 @@ config_init_stdio_encoding(_PyCoreConfig *config, /* PYTHONIOENCODING environment variable */ const char *opt = _PyCoreConfig_GetEnv(config, "PYTHONIOENCODING"); if (opt) { - /* _PyCoreConfig_SetString() requires dest to be initialized to NULL */ - char *pythonioencoding = NULL; - err = _PyCoreConfig_SetString(&pythonioencoding, opt); - if (_Py_INIT_FAILED(err)) { - return err; + char *pythonioencoding = _PyMem_RawStrdup(opt); + if (pythonioencoding == NULL) { + return _Py_INIT_NO_MEMORY(); } char *errors = strchr(pythonioencoding, ':'); @@ -1378,8 +1340,9 @@ config_init_stdio_encoding(_PyCoreConfig *config, /* Does PYTHONIOENCODING contain an encoding? 
*/ if (pythonioencoding[0]) { if (config->stdio_encoding == NULL) { - err = _PyCoreConfig_SetString(&config->stdio_encoding, - pythonioencoding); + err = CONFIG_DECODE_LOCALE(&config->stdio_encoding, + pythonioencoding, + "PYTHONIOENCODING environment variable"); if (_Py_INIT_FAILED(err)) { PyMem_RawFree(pythonioencoding); return err; @@ -1396,7 +1359,9 @@ config_init_stdio_encoding(_PyCoreConfig *config, } if (config->stdio_errors == NULL && errors != NULL) { - err = _PyCoreConfig_SetString(&config->stdio_errors, errors); + err = CONFIG_DECODE_LOCALE(&config->stdio_errors, + errors, + "PYTHONIOENCODING environment variable"); if (_Py_INIT_FAILED(err)) { PyMem_RawFree(pythonioencoding); return err; @@ -1409,15 +1374,14 @@ config_init_stdio_encoding(_PyCoreConfig *config, /* UTF-8 Mode uses UTF-8/surrogateescape */ if (preconfig->utf8_mode) { if (config->stdio_encoding == NULL) { - err = _PyCoreConfig_SetString(&config->stdio_encoding, - "utf-8"); + err = _PyCoreConfig_SetString(&config->stdio_encoding, L"utf-8"); if (_Py_INIT_FAILED(err)) { return err; } } if (config->stdio_errors == NULL) { err = _PyCoreConfig_SetString(&config->stdio_errors, - "surrogateescape"); + L"surrogateescape"); if (_Py_INIT_FAILED(err)) { return err; } @@ -1432,7 +1396,7 @@ config_init_stdio_encoding(_PyCoreConfig *config, } } if (config->stdio_errors == NULL) { - const char *errors = config_get_stdio_errors(config); + const wchar_t *errors = config_get_stdio_errors(config); assert(errors != NULL); err = _PyCoreConfig_SetString(&config->stdio_errors, errors); @@ -1452,33 +1416,32 @@ config_init_fs_encoding(_PyCoreConfig *config, const _PyPreConfig *preconfig) if (config->filesystem_encoding == NULL) { #ifdef _Py_FORCE_UTF8_FS_ENCODING - err = _PyCoreConfig_SetString(&config->filesystem_encoding, - "utf-8"); + err = _PyCoreConfig_SetString(&config->filesystem_encoding, L"utf-8"); #else #ifdef MS_WINDOWS if (preconfig->legacy_windows_fs_encoding) { /* Legacy Windows filesystem encoding: 
mbcs/replace */ err = _PyCoreConfig_SetString(&config->filesystem_encoding, - "mbcs"); + L"mbcs"); } else #endif if (preconfig->utf8_mode) { err = _PyCoreConfig_SetString(&config->filesystem_encoding, - "utf-8"); + L"utf-8"); } #ifndef MS_WINDOWS else if (_Py_GetForceASCII()) { err = _PyCoreConfig_SetString(&config->filesystem_encoding, - "ascii"); + L"ascii"); } #endif else { #ifdef MS_WINDOWS /* Windows defaults to utf-8/surrogatepass (PEP 529). */ err = _PyCoreConfig_SetString(&config->filesystem_encoding, - "utf-8"); + L"utf-8"); #else err = config_get_locale_encoding(&config->filesystem_encoding); #endif @@ -1491,16 +1454,16 @@ config_init_fs_encoding(_PyCoreConfig *config, const _PyPreConfig *preconfig) } if (config->filesystem_errors == NULL) { - const char *errors; + const wchar_t *errors; #ifdef MS_WINDOWS if (preconfig->legacy_windows_fs_encoding) { - errors = "replace"; + errors = L"replace"; } else { - errors = "surrogatepass"; + errors = L"surrogatepass"; } #else - errors = "surrogateescape"; + errors = L"surrogateescape"; #endif err = _PyCoreConfig_SetString(&config->filesystem_errors, errors); if (_Py_INIT_FAILED(err)) { @@ -1745,8 +1708,8 @@ config_parse_cmdline(_PyCoreConfig *config, _PyPreCmdline *precmdline, || wcscmp(_PyOS_optarg, L"never") == 0 || wcscmp(_PyOS_optarg, L"default") == 0) { - err = _PyCoreConfig_SetWideString(&config->check_hash_pycs_mode, - _PyOS_optarg); + err = _PyCoreConfig_SetString(&config->check_hash_pycs_mode, + _PyOS_optarg); if (_Py_INIT_FAILED(err)) { return err; } @@ -2119,7 +2082,7 @@ config_read_cmdline(_PyCoreConfig *config, _PyPreCmdline *precmdline) } if (config->check_hash_pycs_mode == NULL) { - err = _PyCoreConfig_SetWideString(&config->check_hash_pycs_mode, L"default"); + err = _PyCoreConfig_SetString(&config->check_hash_pycs_mode, L"default"); if (_Py_INIT_FAILED(err)) { goto done; } diff --git a/Python/preconfig.c b/Python/preconfig.c index 108cbc6660..48b9e8383a 100644 --- a/Python/preconfig.c +++ 
b/Python/preconfig.c @@ -14,7 +14,10 @@ /* --- File system encoding/errors -------------------------------- */ /* The filesystem encoding is chosen by config_init_fs_encoding(), - see also initfsencoding(). */ + see also initfsencoding(). + + Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors + are encoded to UTF-8. */ const char *Py_FileSystemDefaultEncoding = NULL; int Py_HasFileSystemDefaultEncoding = 0; const char *Py_FileSystemDefaultEncodeErrors = NULL; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 01ef027b9d..2a633cf1cf 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1668,7 +1668,7 @@ is_valid_fd(int fd) static PyObject* create_stdio(const _PyCoreConfig *config, PyObject* io, int fd, int write_mode, const char* name, - const char* encoding, const char* errors) + const wchar_t* encoding, const wchar_t* errors) { PyObject *buf = NULL, *stream = NULL, *text = NULL, *raw = NULL, *res; const char* mode; @@ -1718,7 +1718,7 @@ create_stdio(const _PyCoreConfig *config, PyObject* io, #ifdef MS_WINDOWS /* Windows console IO is always UTF-8 encoded */ if (PyWindowsConsoleIO_Check(raw)) - encoding = "utf-8"; + encoding = L"utf-8"; #endif text = PyUnicode_FromString(name); @@ -1754,10 +1754,25 @@ create_stdio(const _PyCoreConfig *config, PyObject* io, newline = "\n"; #endif - stream = _PyObject_CallMethodId(io, &PyId_TextIOWrapper, "OsssOO", - buf, encoding, errors, + PyObject *encoding_str = PyUnicode_FromWideChar(encoding, -1); + if (encoding_str == NULL) { + Py_CLEAR(buf); + goto error; + } + + PyObject *errors_str = PyUnicode_FromWideChar(errors, -1); + if (errors_str == NULL) { + Py_CLEAR(buf); + Py_CLEAR(encoding_str); + goto error; + } + + stream = _PyObject_CallMethodId(io, &PyId_TextIOWrapper, "OOOsOO", + buf, encoding_str, errors_str, newline, line_buffering, write_through); Py_CLEAR(buf); + Py_CLEAR(encoding_str); + Py_CLEAR(errors_str); if (stream == NULL) goto error; @@ -1874,7 +1889,7 @@ 
init_sys_streams(PyInterpreterState *interp) fd = fileno(stderr); std = create_stdio(config, iomod, fd, 1, "<stderr>", config->stdio_encoding, - "backslashreplace"); + L"backslashreplace"); if (std == NULL) goto error; diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 0f7af2c69d..fbdeb9b556 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -424,7 +424,7 @@ sys_getfilesystemencoding_impl(PyObject *module) { PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); const _PyCoreConfig *config = &interp->core_config; - return PyUnicode_FromString(config->filesystem_encoding); + return PyUnicode_FromWideChar(config->filesystem_encoding, -1); } /*[clinic input] @@ -439,7 +439,7 @@ sys_getfilesystemencodeerrors_impl(PyObject *module) { PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); const _PyCoreConfig *config = &interp->core_config; - return PyUnicode_FromString(config->filesystem_errors); + return PyUnicode_FromWideChar(config->filesystem_errors, -1); } /*[clinic input] @@ -1211,30 +1211,9 @@ static PyObject * sys__enablelegacywindowsfsencoding_impl(PyObject *module) /*[clinic end generated code: output=f5c3855b45e24fe9 input=2bfa931a20704492]*/ { - PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); - _PyCoreConfig *config = &interp->core_config; - - /* Set the filesystem encoding to mbcs/replace (PEP 529) */ - char *encoding = _PyMem_RawStrdup("mbcs"); - char *errors = _PyMem_RawStrdup("replace"); - if (encoding == NULL || errors == NULL) { - PyMem_Free(encoding); - PyMem_Free(errors); - PyErr_NoMemory(); - return NULL; - } - - PyMem_RawFree(config->filesystem_encoding); - config->filesystem_encoding = encoding; - PyMem_RawFree(config->filesystem_errors); - config->filesystem_errors = errors; - - if (_Py_SetFileSystemEncoding(config->filesystem_encoding, - config->filesystem_errors) < 0) { - PyErr_NoMemory(); + if (_PyUnicode_EnableLegacyWindowsFSEncoding() < 0) { return NULL; } - Py_RETURN_NONE; }