bpo-29240: readline now ignores the UTF-8 Mode (#5145)

author Victor Stinner <victor.stinner@gmail.com>

Wed, 10 Jan 2018 21:46:15 +0000 (22:46 +0100)

committer GitHub <noreply@github.com>

Wed, 10 Jan 2018 21:46:15 +0000 (22:46 +0100)
author Victor Stinner <victor.stinner@gmail.com>
Wed, 10 Jan 2018 21:46:15 +0000 (22:46 +0100)
committer GitHub <noreply@github.com>
Wed, 10 Jan 2018 21:46:15 +0000 (22:46 +0100)
diff --git a/Include/fileutils.h b/Include/fileutils.h

index d027e18348fbb347cd69a2707715badf1558888e..2527d84669d1e1de8a8543bb450de67a8fbe7673 100644 (file)
--- a/Include/fileutils.h
+++ b/Include/fileutils.h
@@ -24,6 +24,14 @@ PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
      const char *s,
      Py_ssize_t size,
      size_t *p_wlen);
+
+PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale(
+    const char *arg,
+    size_t *size);
+
+PyAPI_FUNC(char*) _Py_EncodeCurrentLocale(
+    const wchar_t *text,
+    size_t *error_pos);
  #endif
  
  #ifndef Py_LIMITED_API
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 0274de6733ab0e0ce2243aa530ae1815ca61480a..576e7ad8510c5073039d75e080c5729582d1d1a1 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1810,6 +1810,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
      PyObject *unicode,
      const char *errors
      );
+
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize(
+    const char *str,
+    Py_ssize_t len,
+    const char *errors);
+
+PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale(
+    PyObject *unicode,
+    const char *errors
+    );
  #endif
  
  /* --- File system encoding ---------------------------------------------- */
diff --git a/Lib/test/test_readline.py b/Lib/test/test_readline.py

index 28ea38b747ed53e7e94d8af664371f949586ea7b..b4c25dee9d3c20a9f0e3e4986bc32677ed7ecd70 100644 (file)
--- a/Lib/test/test_readline.py
+++ b/Lib/test/test_readline.py
@@ -152,8 +152,6 @@ print("History length:", readline.get_current_history_length())
          output = run_pty(self.auto_history_script.format(False))
          self.assertIn(b"History length: 0\r\n", output)
  
-    @unittest.skipIf(True,
-                     "FIXME: test broken by bpo-29240")
      def test_nonascii(self):
          try:
              readline.add_history("\xEB\xEF")
diff --git a/Modules/readline.c b/Modules/readline.c

index 811fca8cd92a7233512781a484292349835e24f7..8db4cfd01524a33c77e4142537c758dd8b2addd9 100644 (file)
--- a/Modules/readline.c
+++ b/Modules/readline.c
@@ -132,13 +132,14 @@ static PyModuleDef readlinemodule;
  static PyObject *
  encode(PyObject *b)
  {
-    return PyUnicode_EncodeLocale(b, "surrogateescape");
+    return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape");
  }
  
  static PyObject *
  decode(const char *s)
  {
-    return PyUnicode_DecodeLocale(s, "surrogateescape");
+    return _PyUnicode_DecodeCurrentLocaleAndSize(s, strlen(s),
+                                                 "surrogateescape");
  }
  
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 92a6ad6b97996e60c7145eef3cfabc8039013946..1a230e03e639c9e0bfaad63e6a3e30efcd089b08 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3395,8 +3395,8 @@ locale_error_handler(const char *errors, int *surrogateescape)
      }
  }
  
-PyObject *
-PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
+static PyObject *
+unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale)
  {
      Py_ssize_t wlen, wlen2;
      wchar_t *wstr;
@@ -3423,7 +3423,12 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
          /* "surrogateescape" error handler */
          char *str;
  
-        str = Py_EncodeLocale(wstr, &error_pos);
+        if (current_locale) {
+            str = _Py_EncodeCurrentLocale(wstr, &error_pos);
+        }
+        else {
+            str = Py_EncodeLocale(wstr, &error_pos);
+        }
          if (str == NULL) {
              if (error_pos == (size_t)-1) {
                  PyErr_NoMemory();
@@ -3437,7 +3442,12 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
          PyMem_Free(wstr);
  
          bytes = PyBytes_FromString(str);
-        PyMem_Free(str);
+        if (current_locale) {
+            PyMem_RawFree(str);
+        }
+        else {
+            PyMem_Free(str);
+        }
      }
      else {
          /* strict mode */
@@ -3502,6 +3512,18 @@ encode_error:
      return NULL;
  }
  
+PyObject *
+PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
+{
+    return unicode_encode_locale(unicode, errors, 0);
+}
+
+PyObject *
+_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors)
+{
+    return unicode_encode_locale(unicode, errors, 1);
+}
+
  PyObject *
  PyUnicode_EncodeFSDefault(PyObject *unicode)
  {
@@ -3524,7 +3546,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
                                           Py_FileSystemDefaultEncodeErrors);
      }
      else {
-        return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
+        return unicode_encode_locale(unicode,
+                                     Py_FileSystemDefaultEncodeErrors, 0);
      }
  #endif
  }
@@ -3695,9 +3718,9 @@ mbstowcs_errorpos(const char *str, size_t len)
      return 0;
  }
  
-PyObject*
-PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
-                              const char *errors)
+static PyObject*
+unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
+                      int current_locale)
  {
      wchar_t smallbuf[256];
      size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
@@ -3719,7 +3742,12 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
  
      if (surrogateescape) {
          /* "surrogateescape" error handler */
-        wstr = Py_DecodeLocale(str, &wlen);
+        if (current_locale) {
+            wstr = _Py_DecodeCurrentLocale(str, &wlen);
+        }
+        else {
+            wstr = Py_DecodeLocale(str, &wlen);
+        }
          if (wstr == NULL) {
              if (wlen == (size_t)-1)
                  PyErr_NoMemory();
@@ -3794,11 +3822,25 @@ decode_error:
      return NULL;
  }
  
+PyObject*
+PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
+                              const char *errors)
+{
+    return unicode_decode_locale(str, len, errors, 0);
+}
+
+PyObject*
+_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len,
+                                      const char *errors)
+{
+    return unicode_decode_locale(str, len, errors, 1);
+}
+
  PyObject*
  PyUnicode_DecodeLocale(const char *str, const char *errors)
  {
      Py_ssize_t size = (Py_ssize_t)strlen(str);
-    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
+    return unicode_decode_locale(str, size, errors, 0);
  }
  
  
diff --git a/Python/fileutils.c b/Python/fileutils.c

index 645a1793664a1cbee9cab0b8102c1f63316a20ec..9275494e8644419ee701488f686ccd92e7de88a1 100644 (file)
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -263,7 +263,7 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
  
  #if !defined(__APPLE__) && !defined(__ANDROID__)
  static wchar_t*
-decode_locale(const char* arg, size_t *size)
+decode_current_locale(const char* arg, size_t *size)
  {
      wchar_t *res;
      size_t argsize;
@@ -380,32 +380,13 @@ oom:
  #endif
  
  
-/* Decode a byte string from the locale encoding with the
-   surrogateescape error handler: undecodable bytes are decoded as characters
-   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
-   character, escape the bytes using the surrogateescape error handler instead
-   of decoding them.
-
-   Return a pointer to a newly allocated wide character string, use
-   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
-   wide characters excluding the null character into *size
-
-   Return NULL on decoding error or memory allocation error. If *size* is not
-   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
-   decoding error.
-
-   Decoding errors should never happen, unless there is a bug in the C
-   library.
-
-   Use the Py_EncodeLocale() function to encode the character string back to a
-   byte string. */
-wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+static wchar_t*
+decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
  {
  #if defined(__APPLE__) || defined(__ANDROID__)
      return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
  #else
-    if (Py_UTF8Mode == 1) {
+    if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
          return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
      }
  
@@ -426,11 +407,45 @@ Py_DecodeLocale(const char* arg, size_t *size)
      }
  #endif
  
-    return decode_locale(arg, size);
+    return decode_current_locale(arg, size);
  #endif   /* __APPLE__ or __ANDROID__ */
  }
  
  
+/* Decode a byte string from the locale encoding with the
+   surrogateescape error handler: undecodable bytes are decoded as characters
+   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
+   character, escape the bytes using the surrogateescape error handler instead
+   of decoding them.
+
+   Return a pointer to a newly allocated wide character string, use
+   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
+   wide characters excluding the null character into *size
+
+   Return NULL on decoding error or memory allocation error. If *size* is not
+   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
+   decoding error.
+
+   Decoding errors should never happen, unless there is a bug in the C
+   library.
+
+   Use the Py_EncodeLocale() function to encode the character string back to a
+   byte string. */
+wchar_t*
+Py_DecodeLocale(const char* arg, size_t *size)
+{
+    return decode_locale(arg, size, 0);
+}
+
+
+/* Similar to Py_DecodeLocale(), but ignores the UTF-8 Mode */
+wchar_t*
+_Py_DecodeCurrentLocale(const char* arg, size_t *size)
+{
+    return decode_locale(arg, size, 1);
+}
+
+
  #if !defined(__APPLE__) && !defined(__ANDROID__)
  static char*
  encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
@@ -508,12 +523,13 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
  #endif
  
  static char*
-encode_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
+encode_locale(const wchar_t *text, size_t *error_pos,
+              int raw_malloc, int ignore_utf8_mode)
  {
  #if defined(__APPLE__) || defined(__ANDROID__)
      return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
  #else   /* __APPLE__ */
-    if (Py_UTF8Mode == 1) {
+    if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
          return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
      }
  
@@ -544,7 +560,7 @@ encode_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
  char*
  Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
  {
-    return encode_locale(text, error_pos, 0);
+    return encode_locale(text, error_pos, 0, 0);
  }
  
  
@@ -553,7 +569,15 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
  char*
  _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
  {
-    return encode_locale(text, error_pos, 1);
+    return encode_locale(text, error_pos, 1, 0);
+}
+
+
+/* Similar to _Py_EncodeLocaleRaw(), but ignores the UTF-8 Mode */
+char*
+_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos)
+{
+    return encode_locale(text, error_pos, 1, 1);
  }
author	Victor Stinner <victor.stinner@gmail.com>
	Wed, 10 Jan 2018 21:46:15 +0000 (22:46 +0100)
committer	GitHub <noreply@github.com>
	Wed, 10 Jan 2018 21:46:15 +0000 (22:46 +0100)
Include/fileutils.h		patch | blob | history
Include/unicodeobject.h		patch | blob | history
Lib/test/test_readline.py		patch | blob | history
Modules/readline.c		patch | blob | history
Objects/unicodeobject.c		patch | blob | history
Python/fileutils.c		patch | blob | history