Issue #13560: Add PyUnicode_EncodeLocale()

author Victor Stinner <victor.stinner@haypocalc.com>

Sat, 17 Dec 2011 03:13:41 +0000 (04:13 +0100)

committer Victor Stinner <victor.stinner@haypocalc.com>

Sat, 17 Dec 2011 03:13:41 +0000 (04:13 +0100)
author Victor Stinner <victor.stinner@haypocalc.com>
Sat, 17 Dec 2011 03:13:41 +0000 (04:13 +0100)
committer Victor Stinner <victor.stinner@haypocalc.com>
Sat, 17 Dec 2011 03:13:41 +0000 (04:13 +0100)
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst

index 0bf2eea6f100f0e953916781e0e2410457c89b25..a6f3a69bfe3ec62a935aa019b3c3c2cf5953660a 100644 (file)
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -713,7 +713,7 @@ system.
     bytes. If a byte sequence can be decoded as a surrogate character and
     *surrogateescape* is not equal to zero, the byte sequence is escaped using
     the ``'surrogateescape'`` error handler instead of being decoded.  *str*
-   must end with a null character but cannot contain embedded null character.
+   must end with a null character but cannot contain embedded null characters.
  
     .. seealso::
  
@@ -732,6 +732,22 @@ system.
     .. versionadded:: 3.3
  
  
+.. c:function:: PyObject* PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
+
+   Encode a Unicode object to the current locale encoding. The encoder is
+   strict if *surrogateescape* is equal to zero, otherwise it uses the
+   ``'surrogateescape'`` error handler (:pep:`383`). Return a :class:`bytes`
+   object. *unicode* cannot contain embedded null characters.
+
+   .. seealso::
+
+      Use :c:func:`PyUnicode_EncodeFSDefault` to encode a string to
+      :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
+      Python startup).
+
+   .. versionadded:: 3.3
+
+
  File System Encoding
  """"""""""""""""""""
  
@@ -806,6 +822,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
     If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
     locale encoding.
  
+   .. seealso::
+
+      :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
+      locale encoding and cannot be modified later. If you need to encode a
+      string to the current locale encoding, use
+      :c:func:`PyUnicode_EncodeLocale`.
+
     .. versionadded:: 3.2
  
  
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 5f073e0625acdec0ce6af6ace151ebd6bff95c1b..8a23c7dc88a2da5ac73f26c0052d478cb0837191 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1603,7 +1603,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
     be decoded as a surrogate character and *surrogateescape* is not equal to
     zero, the byte sequence is escaped using the 'surrogateescape' error handler
     instead of being decoded. *str* must end with a null character but cannot
-   contain embedded null character. */
+   contain embedded null characters. */
  
  PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
      const char *str,
@@ -1617,6 +1617,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
      const char *str,
      int surrogateescape);
  
+/* Encode a Unicode object to the current locale encoding. The encoder is
+   strict if *surrogateescape* is equal to zero, otherwise the
+   "surrogateescape" error handler is used. Return a bytes object. The string
+   cannot contain embedded null characters. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
+    PyObject *unicode,
+    int surrogateescape
+    );
+
  /* --- File system encoding ---------------------------------------------- */
  
  /* ParseTuple converter: encode str objects to bytes using
diff --git a/Misc/NEWS b/Misc/NEWS

index 5be6990b0e5cdbfdbd363e390b08ed7106baac9c..51505d45f986cb48adf997e3c887e746ad4abd25 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -419,6 +419,10 @@ Core and Builtins
  Library
  -------
  
+- Issue #13560: Add PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize()
+  and PyUnicode_EncodeLocale() functions to the C API to decode/encode from/to
+  the current locale encoding.
+
  - Issue #8373: The filesystem path of AF_UNIX sockets now uses the filesystem
    encoding and the surrogateescape error handler, rather than UTF-8.  Patch
    by David Watson.
@@ -451,8 +455,8 @@ Library
    'importlib.abc.PyPycLoader', 'nntplib.NNTP.xgtitle', 'nntplib.NNTP.xpath',
    and private attributes of 'smtpd.SMTPChannel'.
  
-- Issue #5905: time.strftime() is now using the locale encoding, instead of
-  UTF-8, if the wcsftime() function is not available.
+- Issue #5905, #13560: time.strftime() is now using the current locale
+  encoding, instead of UTF-8, if the wcsftime() function is not available.
  
  - Issue #8641: Update IDLE 3 syntax coloring to recognize b".." and not u"..".
    Patch by Tal Einat.
diff --git a/Modules/timemodule.c b/Modules/timemodule.c

index a46c4f11e40c57a7b61e524a4f4388481df6419b..ad1c54e85e812598a568edcda3b729d5f0632da4 100644 (file)
--- a/Modules/timemodule.c
+++ b/Modules/timemodule.c
@@ -486,7 +486,7 @@ time_strftime(PyObject *self, PyObject *args)
      fmt = format;
  #else
      /* Convert the unicode string to an ascii one */
-    format = PyUnicode_EncodeFSDefault(format_arg);
+    format = PyUnicode_EncodeLocale(format_arg, 1);
      if (format == NULL)
          return NULL;
      fmt = PyBytes_AS_STRING(format);
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 7444c8b4ba0c120b8d9024a04cfaeb3298f689e7..a2c3227df0b136eb0d83a6ab5c3a3ef3c3c93726 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3073,6 +3073,140 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
      return NULL;
  }
  
+static size_t
+wcstombs_errorpos(const wchar_t *wstr)
+{
+    size_t len;
+#if SIZEOF_WCHAR_T == 2
+    wchar_t buf[3];
+#else
+    wchar_t buf[2];
+#endif
+    char outbuf[MB_LEN_MAX];
+    const wchar_t *start, *previous;
+    int save_errno;
+
+    save_errno = errno;
+#if SIZEOF_WCHAR_T == 2
+    buf[2] = 0;
+#else
+    buf[1] = 0;
+#endif
+    start = wstr;
+    while (*wstr != L'\0')
+    {
+        previous = wstr;
+#if SIZEOF_WCHAR_T == 2
+        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
+            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
+        {
+            buf[0] = wstr[0];
+            buf[1] = wstr[1];
+            wstr += 2;
+        }
+        else {
+            buf[0] = *wstr;
+            buf[1] = 0;
+            wstr++;
+        }
+#else
+        buf[0] = *wstr;
+        wstr++;
+#endif
+        len = wcstombs(outbuf, buf, sizeof(outbuf));
+        if (len == (size_t)-1) {
+            errno = save_errno;
+            return previous - start;
+        }
+    }
+
+    /* failed to find the unencodable character */
+    errno = save_errno;
+    return 0;
+}
+
+PyObject *
+PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
+{
+    Py_ssize_t wlen, wlen2;
+    wchar_t *wstr;
+    PyObject *bytes = NULL;
+    char *errmsg;
+    PyObject *exc;
+    size_t error_pos;
+
+    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
+    if (wstr == NULL)
+        return NULL;
+
+    wlen2 = wcslen(wstr);
+    if (wlen2 != wlen) {
+        PyMem_Free(wstr);
+        PyErr_SetString(PyExc_TypeError, "embedded null character");
+        return NULL;
+    }
+
+    if (surrogateescape) {
+        /* locale encoding with surrogateescape */
+        char *str;
+
+        str = _Py_wchar2char(wstr, &error_pos);
+        if (str == NULL) {
+            if (error_pos == (size_t)-1) {
+                PyErr_NoMemory();
+                PyMem_Free(wstr);
+                return NULL;
+            }
+            else {
+                goto encode_error;
+            }
+        }
+        PyMem_Free(wstr);
+
+        bytes = PyBytes_FromString(str);
+        PyMem_Free(str);
+    }
+    else {
+        size_t len, len2;
+
+        len = wcstombs(NULL, wstr, 0);
+        if (len == (size_t)-1) {
+            error_pos = wcstombs_errorpos(wstr);
+            goto encode_error;
+        }
+
+        bytes = PyBytes_FromStringAndSize(NULL, len);
+        if (bytes == NULL) {
+            PyMem_Free(wstr);
+            return NULL;
+        }
+
+        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
+        if (len2 == (size_t)-1 || len2 > len) {
+            error_pos = wcstombs_errorpos(wstr);
+            goto encode_error;
+        }
+        PyMem_Free(wstr);
+    }
+    return bytes;
+
+encode_error:
+    errmsg = strerror(errno);
+    assert(errmsg != NULL);
+    if (errmsg == NULL)
+        errmsg = "wcstombs() encountered an unencodable wide character";
+    PyMem_Free(wstr);
+    Py_XDECREF(bytes);
+
+    exc = NULL;
+    raise_encode_exception(&exc,
+        "locale", unicode,
+        error_pos, error_pos+1,
+        errmsg);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
  PyObject *
  PyUnicode_EncodeFSDefault(PyObject *unicode)
  {
@@ -3097,38 +3231,7 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
                                           "surrogateescape");
      }
      else {
-        /* locale encoding with surrogateescape */
-        wchar_t *wchar;
-        char *bytes;
-        PyObject *bytes_obj;
-        size_t error_pos;
-
-        wchar = PyUnicode_AsWideCharString(unicode, NULL);
-        if (wchar == NULL)
-            return NULL;
-        bytes = _Py_wchar2char(wchar, &error_pos);
-        if (bytes == NULL) {
-            if (error_pos != (size_t)-1) {
-                char *errmsg = strerror(errno);
-                PyObject *exc = NULL;
-                if (errmsg == NULL)
-                    errmsg = "Py_wchar2char() failed";
-                raise_encode_exception(&exc,
-                    "filesystemencoding", unicode,
-                    error_pos, error_pos+1,
-                    errmsg);
-                Py_XDECREF(exc);
-            }
-            else
-                PyErr_NoMemory();
-            PyMem_Free(wchar);
-            return NULL;
-        }
-        PyMem_Free(wchar);
-
-        bytes_obj = PyBytes_FromString(bytes);
-        PyMem_Free(bytes);
-        return bytes_obj;
+        return PyUnicode_EncodeLocale(unicode, 1);
      }
  #endif
  }
author	Victor Stinner <victor.stinner@haypocalc.com>
	Sat, 17 Dec 2011 03:13:41 +0000 (04:13 +0100)
committer	Victor Stinner <victor.stinner@haypocalc.com>
	Sat, 17 Dec 2011 03:13:41 +0000 (04:13 +0100)
Doc/c-api/unicode.rst		patch \| blob \| history
Include/unicodeobject.h		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Modules/timemodule.c		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history