From: Victor Stinner Date: Mon, 15 Jan 2018 22:43:24 +0000 (+0100) Subject: [3.6] bpo-32555: Fix locale encodings (#5193) X-Git-Tag: v3.6.5rc1~155 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b92c159efada05b3a5ff9d0dbce3fcb2334631f6;p=python [3.6] bpo-32555: Fix locale encodings (#5193) On FreeBSD and Solaris, os.strerror() now always decodes the byte string from the current locale encoding, rather than using ASCII/surrogateescape in some cases. Changes: * Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() which have an additional current_locale parameter. * PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and PyUnicode_EncodeLocale() now always use the current locale encoding, instead of using Py_DecodeLocale()/Py_EncodeLocale(). * Document encoding in Py_DecodeLocale() and Py_EncodeLocale() documentation. * Add USE_FORCE_ASCII define to not define decode_ascii_surrogateescape() on Android. --- diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst index 035cdc1682..48e2b2bb5e 100644 --- a/Doc/c-api/sys.rst +++ b/Doc/c-api/sys.rst @@ -66,9 +66,18 @@ Operating System Utilities surrogate character, escape the bytes using the surrogateescape error handler instead of decoding them. + Encoding, highest priority to lowest priority: + + * ``UTF-8`` on macOS and Android; + * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``, + ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias), + and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the + ``ISO-8859-1`` encoding. + * the current locale encoding (``LC_CTYPE`` locale). + Return a pointer to a newly allocated wide character string, use :c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write - the number of wide characters excluding the null character into ``*size`` + the number of wide characters excluding the null character into ``*size``. Return ``NULL`` on decoding error or memory allocation error. 
If *size* is not ``NULL``, ``*size`` is set to ``(size_t)-1`` on memory error or set to @@ -94,6 +103,15 @@ Operating System Utilities :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF. + Encoding, highest priority to lowest priority: + + * ``UTF-8`` on macOS and Android; + * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``, + ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias), + and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the + ``ISO-8859-1`` encoding. + * the current locale encoding. + Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free` to free the memory. Return ``NULL`` on encoding error or memory allocation error diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 6e91576ee8..b9acaec949 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -773,6 +773,12 @@ system. .. versionadded:: 3.3 + .. versionchanged:: 3.6.5 + The function now also uses the current locale encoding for the + ``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale` + was used for the ``surrogateescape``, and the current locale encoding was + used for ``strict``. + .. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) @@ -800,6 +806,12 @@ system. .. versionadded:: 3.3 + .. versionchanged:: 3.6.5 + The function now also uses the current locale encoding for the + ``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale` + was used for the ``surrogateescape``, and the current locale encoding was + used for ``strict``. 
+ File System Encoding """""""""""""""""""" diff --git a/Include/fileutils.h b/Include/fileutils.h index 875715df97..8fa70baa21 100644 --- a/Include/fileutils.h +++ b/Include/fileutils.h @@ -17,6 +17,16 @@ PyAPI_FUNC(char*) Py_EncodeLocale( #ifndef Py_LIMITED_API +PyAPI_FUNC(wchar_t *) _Py_DecodeLocaleEx( + const char *arg, + size_t *size, + int current_locale); + +PyAPI_FUNC(char*) _Py_EncodeLocaleEx( + const wchar_t *text, + size_t *error_pos, + int current_locale); + PyAPI_FUNC(PyObject *) _Py_device_encoding(int); #ifdef MS_WINDOWS diff --git a/Misc/NEWS.d/next/Library/2018-01-15-17-52-47.bpo-32555.CMq2zF.rst b/Misc/NEWS.d/next/Library/2018-01-15-17-52-47.bpo-32555.CMq2zF.rst new file mode 100644 index 0000000000..054f8cac1f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-01-15-17-52-47.bpo-32555.CMq2zF.rst @@ -0,0 +1,3 @@ +On FreeBSD and Solaris, os.strerror() now always decodes the byte string from +the current locale encoding, rather than using ASCII/surrogateescape in some +cases. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 64905e84b1..86cac96cd6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3439,8 +3439,9 @@ locale_error_handler(const char *errors, int *surrogateescape) } } -PyObject * -PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) +static PyObject * +unicode_encode_locale(PyObject *unicode, const char *errors, + int current_locale) { Py_ssize_t wlen, wlen2; wchar_t *wstr; @@ -3469,7 +3470,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) /* "surrogateescape" error handler */ char *str; - str = Py_EncodeLocale(wstr, &error_pos); + str = _Py_EncodeLocaleEx(wstr, &error_pos, current_locale); if (str == NULL) { if (error_pos == (size_t)-1) { PyErr_NoMemory(); @@ -3549,6 +3550,12 @@ encode_error: return NULL; } +PyObject * +PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) +{ + return unicode_encode_locale(unicode, errors, 1); +} + PyObject * PyUnicode_EncodeFSDefault(PyObject *unicode) { @@ -3571,7 +3578,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode) Py_FileSystemDefaultEncodeErrors); } else { - return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors); + return unicode_encode_locale(unicode, + Py_FileSystemDefaultEncodeErrors, 0); } #endif } @@ -3741,9 +3749,9 @@ mbstowcs_errorpos(const char *str, size_t len) return 0; } -PyObject* -PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, - const char *errors) +static PyObject* +unicode_decode_locale(const char *str, Py_ssize_t len, + const char *errors, int current_locale) { wchar_t smallbuf[256]; size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); @@ -3766,7 +3774,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, if (surrogateescape) { /* "surrogateescape" error handler */ - wstr = Py_DecodeLocale(str, &wlen); + wstr = _Py_DecodeLocaleEx(str, &wlen, current_locale); if (wstr == NULL) { if (wlen == (size_t)-1) PyErr_NoMemory(); @@ -3844,11 +3852,18 @@ 
decode_error: return NULL; } +PyObject* +PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t size, + const char *errors) +{ + return unicode_decode_locale(str, size, errors, 1); +} + PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) { Py_ssize_t size = (Py_ssize_t)strlen(str); - return PyUnicode_DecodeLocaleAndSize(str, size, errors); + return unicode_decode_locale(str, size, errors, 1); } @@ -3880,7 +3895,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) Py_FileSystemDefaultEncodeErrors); } else { - return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors); + return unicode_decode_locale(s, size, + Py_FileSystemDefaultEncodeErrors, 0); } #endif } diff --git a/Python/fileutils.c b/Python/fileutils.c index 14dd81b03f..7b87b72f73 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -70,7 +70,10 @@ _Py_device_encoding(int fd) Py_RETURN_NONE; } -#if !defined(__APPLE__) && !defined(MS_WINDOWS) +#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) + +#define USE_FORCE_ASCII + extern int _Py_normalize_encoding(const char *, char *, size_t); /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. @@ -221,7 +224,7 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos) } #endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */ -#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC)) +#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) static wchar_t* decode_ascii_surrogateescape(const char *arg, size_t *size) { @@ -251,39 +254,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size) #endif -/* Decode a byte string from the locale encoding with the - surrogateescape error handler: undecodable bytes are decoded as characters - in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate - character, escape the bytes using the surrogateescape error handler instead - of decoding them. 
- - Return a pointer to a newly allocated wide character string, use - PyMem_RawFree() to free the memory. If size is not NULL, write the number of - wide characters excluding the null character into *size - - Return NULL on decoding error or memory allocation error. If *size* is not - NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on - decoding error. - - Decoding errors should never happen, unless there is a bug in the C - library. - - Use the Py_EncodeLocale() function to encode the character string back to a - byte string. */ -wchar_t* -Py_DecodeLocale(const char* arg, size_t *size) +static wchar_t* +decode_current_locale(const char* arg, size_t *size) { -#if defined(__APPLE__) || defined(__ANDROID__) - wchar_t *wstr; - wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); - if (size != NULL) { - if (wstr != NULL) - *size = wcslen(wstr); - else - *size = (size_t)-1; - } - return wstr; -#else wchar_t *res; size_t argsize; size_t count; @@ -293,19 +266,6 @@ Py_DecodeLocale(const char* arg, size_t *size) mbstate_t mbs; #endif -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) { - /* force ASCII encoding to workaround mbstowcs() issue */ - res = decode_ascii_surrogateescape(arg, size); - if (res == NULL) - goto oom; - return res; - } -#endif - #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that @@ -402,72 +362,96 @@ Py_DecodeLocale(const char* arg, size_t *size) goto oom; #endif /* HAVE_MBRTOWC */ return res; + oom: if (size != NULL) *size = (size_t)-1; return NULL; +} + + +static wchar_t* +decode_locale(const char* arg, size_t *size, int current_locale) +{ + if (current_locale) { + return decode_current_locale(arg, size); + } + +#if defined(__APPLE__) || defined(__ANDROID__) + wchar_t *wstr; + wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); + if (size != NULL) { + if (wstr != NULL) + *size = 
wcslen(wstr); + else + *size = (size_t)-1; + } + return wstr; +#else + +#ifdef USE_FORCE_ASCII + if (force_ascii == -1) { + force_ascii = check_force_ascii(); + } + + if (force_ascii) { + /* force ASCII encoding to workaround mbstowcs() issue */ + wchar_t *res = decode_ascii_surrogateescape(arg, size); + if (res == NULL) { + if (size != NULL) + *size = (size_t)-1; + return NULL; + } + return res; + } +#endif + + return decode_current_locale(arg, size); #endif /* __APPLE__ or __ANDROID__ */ } -/* Encode a wide character string to the locale encoding with the - surrogateescape error handler: surrogate characters in the range - U+DC80..U+DCFF are converted to bytes 0x80..0xFF. - Return a pointer to a newly allocated byte string, use PyMem_Free() to free - the memory. Return NULL on encoding or memory allocation error. +/* Decode a byte string from the locale encoding with the + surrogateescape error handler: undecodable bytes are decoded as characters + in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate + character, escape the bytes using the surrogateescape error handler instead + of decoding them. - If error_pos is not NULL, *error_pos is set to the index of the invalid - character on encoding error, or set to (size_t)-1 otherwise. + Return a pointer to a newly allocated wide character string, use + PyMem_RawFree() to free the memory. If size is not NULL, write the number of + wide characters excluding the null character into *size - Use the Py_DecodeLocale() function to decode the bytes string back to a wide - character string. */ -char* -Py_EncodeLocale(const wchar_t *text, size_t *error_pos) + Return NULL on decoding error or memory allocation error. If *size* is not + NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on + decoding error. + + Decoding errors should never happen, unless there is a bug in the C + library. + + Use the Py_EncodeLocale() function to encode the character string back to a + byte string. 
*/ +wchar_t* +Py_DecodeLocale(const char* arg, size_t *size) { -#if defined(__APPLE__) || defined(__ANDROID__) - Py_ssize_t len; - PyObject *unicode, *bytes = NULL; - char *cpath; + return decode_locale(arg, size, 0); +} - unicode = PyUnicode_FromWideChar(text, wcslen(text)); - if (unicode == NULL) - return NULL; - bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); - Py_DECREF(unicode); - if (bytes == NULL) { - PyErr_Clear(); - if (error_pos != NULL) - *error_pos = (size_t)-1; - return NULL; - } +wchar_t* +_Py_DecodeLocaleEx(const char* arg, size_t *size, int current_locale) +{ + return decode_locale(arg, size, current_locale); +} - len = PyBytes_GET_SIZE(bytes); - cpath = PyMem_Malloc(len+1); - if (cpath == NULL) { - PyErr_Clear(); - Py_DECREF(bytes); - if (error_pos != NULL) - *error_pos = (size_t)-1; - return NULL; - } - memcpy(cpath, PyBytes_AsString(bytes), len + 1); - Py_DECREF(bytes); - return cpath; -#else /* __APPLE__ */ + +static char* +encode_current_locale(const wchar_t *text, size_t *error_pos) +{ const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) - return encode_ascii_surrogateescape(text, error_pos); -#endif - /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. 
outputs the bytes */ @@ -522,10 +506,89 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos) bytes = result; } return result; +} + + +static char* +encode_locale(const wchar_t *text, size_t *error_pos, int current_locale) +{ + if (current_locale) { + return encode_current_locale(text, error_pos); + } + +#if defined(__APPLE__) || defined(__ANDROID__) + Py_ssize_t len; + PyObject *unicode, *bytes = NULL; + char *cpath; + + unicode = PyUnicode_FromWideChar(text, wcslen(text)); + if (unicode == NULL) + return NULL; + + bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); + Py_DECREF(unicode); + if (bytes == NULL) { + PyErr_Clear(); + if (error_pos != NULL) + *error_pos = (size_t)-1; + return NULL; + } + + len = PyBytes_GET_SIZE(bytes); + cpath = PyMem_Malloc(len+1); + if (cpath == NULL) { + PyErr_Clear(); + Py_DECREF(bytes); + if (error_pos != NULL) + *error_pos = (size_t)-1; + return NULL; + } + memcpy(cpath, PyBytes_AsString(bytes), len + 1); + Py_DECREF(bytes); + return cpath; +#else /* __APPLE__ */ + +#ifdef USE_FORCE_ASCII + if (force_ascii == -1) { + force_ascii = check_force_ascii(); + } + + if (force_ascii) { + return encode_ascii_surrogateescape(text, error_pos); + } +#endif + + return encode_current_locale(text, error_pos); #endif /* __APPLE__ or __ANDROID__ */ } +/* Encode a wide character string to the locale encoding with the + surrogateescape error handler: surrogate characters in the range + U+DC80..U+DCFF are converted to bytes 0x80..0xFF. + + Return a pointer to a newly allocated byte string, use PyMem_Free() to free + the memory. Return NULL on encoding or memory allocation error. + + If error_pos is not NULL, *error_pos is set to the index of the invalid + character on encoding error, or set to (size_t)-1 otherwise. + + Use the Py_DecodeLocale() function to decode the bytes string back to a wide + character string. 
*/ +char* +Py_EncodeLocale(const wchar_t *text, size_t *error_pos) +{ + return encode_locale(text, error_pos, 0); +} + + +char* +_Py_EncodeLocaleEx(const wchar_t *text, size_t *error_pos, int current_locale) +{ + return encode_locale(text, error_pos, current_locale); +} + + #ifdef MS_WINDOWS static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */