[3.6] bpo-32555: Fix locale encodings (#5193)

author Victor Stinner <victor.stinner@gmail.com>

Mon, 15 Jan 2018 22:43:24 +0000 (23:43 +0100)

committer GitHub <noreply@github.com>

Mon, 15 Jan 2018 22:43:24 +0000 (23:43 +0100)
author Victor Stinner <victor.stinner@gmail.com>
Mon, 15 Jan 2018 22:43:24 +0000 (23:43 +0100)
committer GitHub <noreply@github.com>
Mon, 15 Jan 2018 22:43:24 +0000 (23:43 +0100)
diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst

index 035cdc16824886e78641514ba14008118592b733..48e2b2bb5e91a6f6cbfa454471fee4c1590ddfa9 100644 (file)
--- a/Doc/c-api/sys.rst
+++ b/Doc/c-api/sys.rst
@@ -66,9 +66,18 @@ Operating System Utilities
     surrogate character, escape the bytes using the surrogateescape error
     handler instead of decoding them.
  
+   Encoding, highest priority to lowest priority:
+
+   * ``UTF-8`` on macOS and Android;
+   * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+     ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+     and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+     ``ISO-8859-1`` encoding.
+   * the current locale encoding (``LC_CTYPE`` locale).
+
     Return a pointer to a newly allocated wide character string, use
     :c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
-   the number of wide characters excluding the null character into ``*size``
+   the number of wide characters excluding the null character into ``*size``.
  
     Return ``NULL`` on decoding error or memory allocation error. If *size* is
     not ``NULL``, ``*size`` is set to ``(size_t)-1`` on memory error or set to
@@ -94,6 +103,15 @@ Operating System Utilities
     :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
     in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
  
+   Encoding, highest priority to lowest priority:
+
+   * ``UTF-8`` on macOS and Android;
+   * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+     ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+     and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+     ``ISO-8859-1`` encoding.
+   * the current locale encoding (``LC_CTYPE`` locale).
+
     Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
     to free the memory. Return ``NULL`` on encoding error or memory allocation
     error
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst

index 6e91576ee8f0045f0b953f46cfbf0374e7702112..b9acaec949b252e78b94759e440009265b2eb79a 100644 (file)
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -773,6 +773,12 @@ system.
  
     .. versionadded:: 3.3
  
+   .. versionchanged:: 3.6.5
+      The function now also uses the current locale encoding for the
+      ``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
+      was used for ``surrogateescape``, and the current locale encoding was
+      used for ``strict``.
+
  
  .. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
  
@@ -800,6 +806,12 @@ system.
  
     .. versionadded:: 3.3
  
+   .. versionchanged:: 3.6.5
+      The function now also uses the current locale encoding for the
+      ``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
+      was used for ``surrogateescape``, and the current locale encoding was
+      used for ``strict``.
+
  
  File System Encoding
  """"""""""""""""""""
diff --git a/Include/fileutils.h b/Include/fileutils.h

index 875715df97a58a90d7010dbebf184692ef8d420d..8fa70baa21cf98108891be31a391ae3ad22d88ae 100644 (file)
--- a/Include/fileutils.h
+++ b/Include/fileutils.h
@@ -17,6 +17,16 @@ PyAPI_FUNC(char*) Py_EncodeLocale(
  
  #ifndef Py_LIMITED_API
  
+PyAPI_FUNC(wchar_t *) _Py_DecodeLocaleEx(
+    const char *arg,
+    size_t *size,
+    int current_locale);
+
+PyAPI_FUNC(char*) _Py_EncodeLocaleEx(
+    const wchar_t *text,
+    size_t *error_pos,
+    int current_locale);
+
  PyAPI_FUNC(PyObject *) _Py_device_encoding(int);
  
  #ifdef MS_WINDOWS
diff --git a/Misc/NEWS.d/next/Library/2018-01-15-17-52-47.bpo-32555.CMq2zF.rst b/Misc/NEWS.d/next/Library/2018-01-15-17-52-47.bpo-32555.CMq2zF.rst

new file mode 100644 (file)

index 0000000..054f8ca
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-01-15-17-52-47.bpo-32555.CMq2zF.rst
@@ -0,0 +1,3 @@
+On FreeBSD and Solaris, os.strerror() now always decodes the byte string from
+the current locale encoding, rather than using ASCII/surrogateescape in some
+cases.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 64905e84b149c9d1c393afa99c8a049d960d14bf..86cac96cd620bd080606bbbaa4ae0bf2d95ce109 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3439,8 +3439,9 @@ locale_error_handler(const char *errors, int *surrogateescape)
      }
  }
  
-PyObject *
-PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
+static PyObject *
+unicode_encode_locale(PyObject *unicode, const char *errors,
+                      int current_locale)
  {
      Py_ssize_t wlen, wlen2;
      wchar_t *wstr;
@@ -3469,7 +3470,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
          /* "surrogateescape" error handler */
          char *str;
  
-        str = Py_EncodeLocale(wstr, &error_pos);
+        str = _Py_EncodeLocaleEx(wstr, &error_pos, current_locale);
          if (str == NULL) {
              if (error_pos == (size_t)-1) {
                  PyErr_NoMemory();
@@ -3549,6 +3550,12 @@ encode_error:
      return NULL;
  }
  
+PyObject *
+PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
+{
+    return unicode_encode_locale(unicode, errors, 1);
+}
+
  PyObject *
  PyUnicode_EncodeFSDefault(PyObject *unicode)
  {
@@ -3571,7 +3578,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
                                           Py_FileSystemDefaultEncodeErrors);
      }
      else {
-        return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
+        return unicode_encode_locale(unicode,
+                                     Py_FileSystemDefaultEncodeErrors, 0);
      }
  #endif
  }
@@ -3741,9 +3749,9 @@ mbstowcs_errorpos(const char *str, size_t len)
      return 0;
  }
  
-PyObject*
-PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
-                              const char *errors)
+static PyObject*
+unicode_decode_locale(const char *str, Py_ssize_t len,
+                      const char *errors, int current_locale)
  {
      wchar_t smallbuf[256];
      size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
@@ -3766,7 +3774,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
  
      if (surrogateescape) {
          /* "surrogateescape" error handler */
-        wstr = Py_DecodeLocale(str, &wlen);
+        wstr = _Py_DecodeLocaleEx(str, &wlen, current_locale);
          if (wstr == NULL) {
              if (wlen == (size_t)-1)
                  PyErr_NoMemory();
@@ -3844,11 +3852,18 @@ decode_error:
      return NULL;
  }
  
+PyObject*
+PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t size,
+                              const char *errors)
+{
+    return unicode_decode_locale(str, size, errors, 1);
+}
+
  PyObject*
  PyUnicode_DecodeLocale(const char *str, const char *errors)
  {
      Py_ssize_t size = (Py_ssize_t)strlen(str);
-    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
+    return unicode_decode_locale(str, size, errors, 1);
  }
  
  
@@ -3880,7 +3895,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
                                  Py_FileSystemDefaultEncodeErrors);
      }
      else {
-        return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
+        return unicode_decode_locale(s, size,
+                                     Py_FileSystemDefaultEncodeErrors, 0);
      }
  #endif
  }
diff --git a/Python/fileutils.c b/Python/fileutils.c

index 14dd81b03f006de0c103eaec625881428e5c4759..7b87b72f73c8b461687cf2a4c0a26f60804524ef 100644 (file)
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -70,7 +70,10 @@ _Py_device_encoding(int fd)
      Py_RETURN_NONE;
  }
  
-#if !defined(__APPLE__) && !defined(MS_WINDOWS)
+#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS)
+
+#define USE_FORCE_ASCII
+
  extern int _Py_normalize_encoding(const char *, char *, size_t);
  
  /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
@@ -221,7 +224,7 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
  }
  #endif   /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
  
-#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
+#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
  static wchar_t*
  decode_ascii_surrogateescape(const char *arg, size_t *size)
  {
@@ -251,39 +254,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
  #endif
  
  
-/* Decode a byte string from the locale encoding with the
-   surrogateescape error handler: undecodable bytes are decoded as characters
-   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
-   character, escape the bytes using the surrogateescape error handler instead
-   of decoding them.
-
-   Return a pointer to a newly allocated wide character string, use
-   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
-   wide characters excluding the null character into *size
-
-   Return NULL on decoding error or memory allocation error. If *size* is not
-   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
-   decoding error.
-
-   Decoding errors should never happen, unless there is a bug in the C
-   library.
-
-   Use the Py_EncodeLocale() function to encode the character string back to a
-   byte string. */
-wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+static wchar_t*
+decode_current_locale(const char* arg, size_t *size)
  {
-#if defined(__APPLE__) || defined(__ANDROID__)
-    wchar_t *wstr;
-    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
-    if (size != NULL) {
-        if (wstr != NULL)
-            *size = wcslen(wstr);
-        else
-            *size = (size_t)-1;
-    }
-    return wstr;
-#else
      wchar_t *res;
      size_t argsize;
      size_t count;
@@ -293,19 +266,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
      mbstate_t mbs;
  #endif
  
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
-        force_ascii = check_force_ascii();
-
-    if (force_ascii) {
-        /* force ASCII encoding to workaround mbstowcs() issue */
-        res = decode_ascii_surrogateescape(arg, size);
-        if (res == NULL)
-            goto oom;
-        return res;
-    }
-#endif
-
  #ifdef HAVE_BROKEN_MBSTOWCS
      /* Some platforms have a broken implementation of
       * mbstowcs which does not count the characters that
@@ -402,72 +362,96 @@ Py_DecodeLocale(const char* arg, size_t *size)
          goto oom;
  #endif   /* HAVE_MBRTOWC */
      return res;
+
  oom:
      if (size != NULL)
          *size = (size_t)-1;
      return NULL;
+}
+
+
+static wchar_t*
+decode_locale(const char* arg, size_t *size, int current_locale)
+{
+    if (current_locale) {
+        return decode_current_locale(arg, size);
+    }
+
+#if defined(__APPLE__) || defined(__ANDROID__)
+    wchar_t *wstr;
+    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
+    if (size != NULL) {
+        if (wstr != NULL)
+            *size = wcslen(wstr);
+        else
+            *size = (size_t)-1;
+    }
+    return wstr;
+#else
+
+#ifdef USE_FORCE_ASCII
+    if (force_ascii == -1) {
+        force_ascii = check_force_ascii();
+    }
+
+    if (force_ascii) {
+        /* force ASCII encoding to workaround mbstowcs() issue */
+        wchar_t *res = decode_ascii_surrogateescape(arg, size);
+        if (res == NULL) {
+            if (size != NULL)
+                *size = (size_t)-1;
+            return NULL;
+        }
+        return res;
+    }
+#endif
+
+    return decode_current_locale(arg, size);
  #endif   /* __APPLE__ or __ANDROID__ */
  }
  
-/* Encode a wide character string to the locale encoding with the
-   surrogateescape error handler: surrogate characters in the range
-   U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
  
-   Return a pointer to a newly allocated byte string, use PyMem_Free() to free
-   the memory. Return NULL on encoding or memory allocation error.
+/* Decode a byte string from the locale encoding with the
+   surrogateescape error handler: undecodable bytes are decoded as characters
+   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
+   character, escape the bytes using the surrogateescape error handler instead
+   of decoding them.
  
-   If error_pos is not NULL, *error_pos is set to the index of the invalid
-   character on encoding error, or set to (size_t)-1 otherwise.
+   Return a pointer to a newly allocated wide character string, use
+   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
+   wide characters excluding the null character into *size.
  
-   Use the Py_DecodeLocale() function to decode the bytes string back to a wide
-   character string. */
-char*
-Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+   Return NULL on decoding error or memory allocation error. If *size* is not
+   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
+   decoding error.
+
+   Decoding errors should never happen, unless there is a bug in the C
+   library.
+
+   Use the Py_EncodeLocale() function to encode the character string back to a
+   byte string. */
+wchar_t*
+Py_DecodeLocale(const char* arg, size_t *size)
  {
-#if defined(__APPLE__) || defined(__ANDROID__)
-    Py_ssize_t len;
-    PyObject *unicode, *bytes = NULL;
-    char *cpath;
+    return decode_locale(arg, size, 0);
+}
  
-    unicode = PyUnicode_FromWideChar(text, wcslen(text));
-    if (unicode == NULL)
-        return NULL;
  
-    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
-    Py_DECREF(unicode);
-    if (bytes == NULL) {
-        PyErr_Clear();
-        if (error_pos != NULL)
-            *error_pos = (size_t)-1;
-        return NULL;
-    }
+wchar_t*
+_Py_DecodeLocaleEx(const char* arg, size_t *size, int current_locale)
+{
+    return decode_locale(arg, size, current_locale);
+}
  
-    len = PyBytes_GET_SIZE(bytes);
-    cpath = PyMem_Malloc(len+1);
-    if (cpath == NULL) {
-        PyErr_Clear();
-        Py_DECREF(bytes);
-        if (error_pos != NULL)
-            *error_pos = (size_t)-1;
-        return NULL;
-    }
-    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
-    Py_DECREF(bytes);
-    return cpath;
-#else   /* __APPLE__ */
+
+static char*
+encode_current_locale(const wchar_t *text, size_t *error_pos)
+{
      const size_t len = wcslen(text);
      char *result = NULL, *bytes = NULL;
      size_t i, size, converted;
      wchar_t c, buf[2];
  
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
-        force_ascii = check_force_ascii();
-
-    if (force_ascii)
-        return encode_ascii_surrogateescape(text, error_pos);
-#endif
-
      /* The function works in two steps:
         1. compute the length of the output buffer in bytes (size)
         2. outputs the bytes */
@@ -522,10 +506,89 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
          bytes = result;
      }
      return result;
+}
+
+
+static char*
+encode_locale(const wchar_t *text, size_t *error_pos, int current_locale)
+{
+    if (current_locale) {
+        return encode_current_locale(text, error_pos);
+    }
+
+#if defined(__APPLE__) || defined(__ANDROID__)
+    Py_ssize_t len;
+    PyObject *unicode, *bytes = NULL;
+    char *cpath;
+
+    unicode = PyUnicode_FromWideChar(text, wcslen(text));
+    if (unicode == NULL)
+        return NULL;
+
+    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
+    Py_DECREF(unicode);
+    if (bytes == NULL) {
+        PyErr_Clear();
+        if (error_pos != NULL)
+            *error_pos = (size_t)-1;
+        return NULL;
+    }
+
+    len = PyBytes_GET_SIZE(bytes);
+    cpath = PyMem_Malloc(len+1);
+    if (cpath == NULL) {
+        PyErr_Clear();
+        Py_DECREF(bytes);
+        if (error_pos != NULL)
+            *error_pos = (size_t)-1;
+        return NULL;
+    }
+    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
+    Py_DECREF(bytes);
+    return cpath;
+#else   /* __APPLE__ */
+
+#ifdef USE_FORCE_ASCII
+    if (force_ascii == -1) {
+        force_ascii = check_force_ascii();
+    }
+
+    if (force_ascii) {
+        return encode_ascii_surrogateescape(text, error_pos);
+    }
+#endif
+
+    return encode_current_locale(text, error_pos);
  #endif   /* __APPLE__ or __ANDROID__ */
  }
  
  
+/* Encode a wide character string to the locale encoding with the
+   surrogateescape error handler: surrogate characters in the range
+   U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
+
+   Return a pointer to a newly allocated byte string, use PyMem_Free() to free
+   the memory. Return NULL on encoding or memory allocation error.
+
+   If error_pos is not NULL, *error_pos is set to the index of the invalid
+   character on encoding error, or set to (size_t)-1 otherwise.
+
+   Use the Py_DecodeLocale() function to decode the byte string back to a wide
+   character string. */
+char*
+Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+{
+    return encode_locale(text, error_pos, 0);
+}
+
+
+char*
+_Py_EncodeLocaleEx(const wchar_t *text, size_t *error_pos, int current_locale)
+{
+    return encode_locale(text, error_pos, current_locale);
+}
+
+
  #ifdef MS_WINDOWS
  static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
author	Victor Stinner <victor.stinner@gmail.com>
	Mon, 15 Jan 2018 22:43:24 +0000 (23:43 +0100)
committer	GitHub <noreply@github.com>
	Mon, 15 Jan 2018 22:43:24 +0000 (23:43 +0100)
Doc/c-api/sys.rst		patch \| blob \| history
Doc/c-api/unicode.rst		patch \| blob \| history
Include/fileutils.h		patch \| blob \| history
Misc/NEWS.d/next/Library/2018-01-15-17-52-47.bpo-32555.CMq2zF.rst	[new file with mode: 0644]	patch \| blob
Objects/unicodeobject.c		patch \| blob \| history
Python/fileutils.c		patch \| blob \| history