Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale()

author Victor Stinner <victor.stinner@haypocalc.com>

Fri, 16 Dec 2011 22:56:01 +0000 (23:56 +0100)

committer Victor Stinner <victor.stinner@haypocalc.com>

Fri, 16 Dec 2011 22:56:01 +0000 (23:56 +0100)
author Victor Stinner <victor.stinner@haypocalc.com>
Fri, 16 Dec 2011 22:56:01 +0000 (23:56 +0100)
committer Victor Stinner <victor.stinner@haypocalc.com>
Fri, 16 Dec 2011 22:56:01 +0000 (23:56 +0100)
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst

index 81ed54045868e75853f9f566fec1f3009bc9fce2..0bf2eea6f100f0e953916781e0e2410457c89b25 100644 (file)
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python
     throughout the interpreter whenever coercion to Unicode is needed.
  
  
+Locale Encoding
+"""""""""""""""
+
+The current locale encoding can be used to decode text from the operating
+system.
+
+.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape)
+
+   Decode a string from the current locale encoding. The decoder is strict if
+   *surrogateescape* is equal to zero, otherwise it uses the
+   ``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable
+   bytes. If a byte sequence can be decoded as a surrogate character and
+   *surrogateescape* is not equal to zero, the byte sequence is escaped using
+   the ``'surrogateescape'`` error handler instead of being decoded.  *str*
+   must end with a null character but cannot contain embedded null characters.
+
+   .. seealso::
+
+      Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from
+      :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
+      Python startup).
+
+   .. versionadded:: 3.3
+
+
+.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape)
+
+   Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string
+   length using :c:func:`strlen`.
+
+   .. versionadded:: 3.3
+
+
  File System Encoding
  """"""""""""""""""""
  
@@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
     If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
     locale encoding.
  
+   .. seealso::
+
+      :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
+      locale encoding and cannot be modified later. If you need to decode a
+      string from the current locale encoding, use
+      :c:func:`PyUnicode_DecodeLocaleAndSize`.
+
     .. versionchanged:: 3.2
        Use ``'strict'`` error handler on Windows.
  
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index cd35ae629a95e9fa9dff5580340388cb120e5cbe..5f073e0625acdec0ce6af6ace151ebd6bff95c1b 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
      );
  #endif
  
+/* --- Locale encoding --------------------------------------------------- */
+
+/* Decode a string from the current locale encoding. The decoder is strict if
+   *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
+   error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
+   be decoded as a surrogate character and *surrogateescape* is not equal to
+   zero, the byte sequence is escaped using the 'surrogateescape' error handler
+   instead of being decoded. *str* must end with a null character but cannot
+   contain embedded null characters. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
+    const char *str,
+    Py_ssize_t len,
+    int surrogateescape);
+
+/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
+   length using strlen(). */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
+    const char *str,
+    int surrogateescape);
+
  /* --- File system encoding ---------------------------------------------- */
  
  /* ParseTuple converter: encode str objects to bytes using
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c

index 9bba1b39cf7c3355fd6a29b57ac2c6fe664e6aaf..1cab7c0a7482bed99b1d15558f22749de72ccc78 100644 (file)
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales.");
  
  static PyObject *Error;
  
-/* Convert a char* to a Unicode object according to the current locale */
-static PyObject*
-str2uni(const char* s)
-{
-#ifdef HAVE_BROKEN_MBSTOWCS
-    size_t needed = strlen(s);
-#else
-    size_t needed = mbstowcs(NULL, s, 0);
-#endif
-    size_t res1;
-    wchar_t smallbuf[30];
-    wchar_t *dest;
-    PyObject *res2;
-    if (needed == (size_t)-1) {
-        PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string");
-        return NULL;
-    }
-    if (needed*sizeof(wchar_t) < sizeof(smallbuf))
-        dest = smallbuf;
-    else {
-        dest = PyMem_Malloc((needed+1)*sizeof(wchar_t));
-        if (!dest)
-            return PyErr_NoMemory();
-    }
-    /* This shouldn't fail now */
-    res1 = mbstowcs(dest, s, needed+1);
-#ifdef HAVE_BROKEN_MBSTOWCS
-    assert(res1 != (size_t)-1);
-#else
-    assert(res1 == needed);
-#endif
-    res2 = PyUnicode_FromWideChar(dest, res1);
-    if (dest != smallbuf)
-        PyMem_Free(dest);
-    return res2;
-}
-
  /* support functions for formatting floating point numbers */
  
  PyDoc_STRVAR(setlocale__doc__,
@@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
              PyErr_SetString(Error, "unsupported locale setting");
              return NULL;
          }
-        result_object = str2uni(result);
+        result_object = PyUnicode_DecodeLocale(result, 0);
          if (!result_object)
              return NULL;
      } else {
@@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
              PyErr_SetString(Error, "locale query failed");
              return NULL;
          }
-        result_object = str2uni(result);
+        result_object = PyUnicode_DecodeLocale(result, 0);
      }
      return result_object;
  }
@@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self)
         involved herein */
  
  #define RESULT_STRING(s)\
-    x = str2uni(l->s);   \
+    x = PyUnicode_DecodeLocale(l->s, 0);   \
      if (!x) goto failed;\
      PyDict_SetItemString(result, #s, x);\
      Py_XDECREF(x)
@@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args)
                 instead of an empty string for nl_langinfo(ERA).  */
              const char *result = nl_langinfo(item);
              result = result != NULL ? result : "";
-            return str2uni(result);
+            return PyUnicode_DecodeLocale(result, 0);
          }
      PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant");
      return NULL;
@@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args)
      char *in;
      if (!PyArg_ParseTuple(args, "s", &in))
          return 0;
-    return str2uni(gettext(in));
+    return PyUnicode_DecodeLocale(gettext(in), 0);
  }
  
  PyDoc_STRVAR(dgettext__doc__,
@@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args)
      char *domain, *in;
      if (!PyArg_ParseTuple(args, "zs", &domain, &in))
          return 0;
-    return str2uni(dgettext(domain, in));
+    return PyUnicode_DecodeLocale(dgettext(domain, in), 0);
  }
  
  PyDoc_STRVAR(dcgettext__doc__,
@@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args)
      int category;
      if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category))
          return 0;
-    return str2uni(dcgettext(domain,msgid,category));
+    return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0);
  }
  
  PyDoc_STRVAR(textdomain__doc__,
@@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args)
          PyErr_SetFromErrno(PyExc_OSError);
          return NULL;
      }
-    return str2uni(domain);
+    return PyUnicode_DecodeLocale(domain, 0);
  }
  
  PyDoc_STRVAR(bindtextdomain__doc__,
@@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args)
          PyErr_SetFromErrno(PyExc_OSError);
          return NULL;
      }
-    result = str2uni(current_dirname);
+    result = PyUnicode_DecodeLocale(current_dirname, 0);
      Py_XDECREF(dirname_bytes);
      return result;
  }
@@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
          return NULL;
      codeset = bind_textdomain_codeset(domain, codeset);
      if (codeset)
-        return str2uni(codeset);
+        return PyUnicode_DecodeLocale(codeset, 0);
      Py_RETURN_NONE;
  }
  #endif
diff --git a/Modules/main.c b/Modules/main.c

index d4c3314d24f998c83d790e236a0404376ff9119c..4899378dc85bae38bc4860318d43fc0eb10a4b92 100644 (file)
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv)
              /* Use utf-8 on Mac OS X */
              unicode = PyUnicode_FromString(p);
  #else
-            wchar_t *wchar;
-            size_t len;
-            wchar = _Py_char2wchar(p, &len);
-            if (wchar == NULL)
-                continue;
-            unicode = PyUnicode_FromWideChar(wchar, len);
-            PyMem_Free(wchar);
+            unicode = PyUnicode_DecodeLocale(p, 1);
  #endif
-            if (unicode == NULL)
+            if (unicode == NULL) {
+                /* ignore errors */
+                PyErr_Clear();
                  continue;
+            }
              PySys_AddWarnOptionUnicode(unicode);
              Py_DECREF(unicode);
          }
diff --git a/Modules/timemodule.c b/Modules/timemodule.c

index 001b311731b4bd9ae49ea1fe7d708ed2b18e7eae..a46c4f11e40c57a7b61e524a4f4388481df6419b 100644 (file)
--- a/Modules/timemodule.c
+++ b/Modules/timemodule.c
@@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args)
  #ifdef HAVE_WCSFTIME
              ret = PyUnicode_FromWideChar(outbuf, buflen);
  #else
-            ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen);
+            ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1);
  #endif
              PyMem_Free(outbuf);
              break;
@@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) {
  #endif /* PYOS_OS2 */
  #endif
      PyModule_AddIntConstant(m, "daylight", daylight);
-    otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0]));
-    otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1]));
+    otz0 = PyUnicode_DecodeLocale(tzname[0], 1);
+    otz1 = PyUnicode_DecodeLocale(tzname[1], 1);
      PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
  #else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
  #ifdef HAVE_STRUCT_TM_TM_ZONE
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 5758ffacf31aaedd9e7b766f0b3100383a0b96b8..7444c8b4ba0c120b8d9024a04cfaeb3298f689e7 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3234,6 +3234,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
      return NULL;
  }
  
+PyObject*
+PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
+                              int surrogateescape)
+{
+    wchar_t smallbuf[256];
+    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
+    wchar_t *wstr;
+    size_t wlen, wlen2;
+    PyObject *unicode;
+
+    if (str[len] != '\0' || len != strlen(str)) {
+        PyErr_SetString(PyExc_TypeError, "embedded null character");
+        return NULL;
+    }
+
+    if (surrogateescape)
+    {
+        wstr = _Py_char2wchar(str, &wlen);
+        if (wstr == NULL) {
+            if (wlen == (size_t)-1)
+                PyErr_NoMemory();
+            else
+                PyErr_SetFromErrno(PyExc_OSError);
+            return NULL;
+        }
+
+        unicode = PyUnicode_FromWideChar(wstr, wlen);
+        PyMem_Free(wstr);
+    }
+    else {
+#ifndef HAVE_BROKEN_MBSTOWCS
+        wlen = mbstowcs(NULL, str, 0);
+#else
+        wlen = len;
+#endif
+        if (wlen == (size_t)-1) {
+            PyErr_SetFromErrno(PyExc_OSError);
+            return NULL;
+        }
+        if (wlen+1 <= smallbuf_len) {
+            wstr = smallbuf;
+        }
+        else {
+            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
+                return PyErr_NoMemory();
+
+            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
+            if (!wstr)
+                return PyErr_NoMemory();
+        }
+
+        /* This shouldn't fail now */
+        wlen2 = mbstowcs(wstr, str, wlen+1);
+        if (wlen2 == (size_t)-1) {
+            if (wstr != smallbuf)
+                PyMem_Free(wstr);
+            PyErr_SetFromErrno(PyExc_OSError);
+            return NULL;
+        }
+#ifdef HAVE_BROKEN_MBSTOWCS
+        assert(wlen2 == wlen);
+#endif
+        unicode = PyUnicode_FromWideChar(wstr, wlen2);
+        if (wstr != smallbuf)
+            PyMem_Free(wstr);
+    }
+    return unicode;
+}
+
+PyObject*
+PyUnicode_DecodeLocale(const char *str, int surrogateescape)
+{
+    Py_ssize_t size = (Py_ssize_t)strlen(str);
+    return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
+}
+
+
  PyObject*
  PyUnicode_DecodeFSDefault(const char *s) {
      Py_ssize_t size = (Py_ssize_t)strlen(s);
@@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
                                  "surrogateescape");
      }
      else {
-        /* locale encoding with surrogateescape */
-        wchar_t *wchar;
-        PyObject *unicode;
-        size_t len;
-
-        if (s[size] != '\0' || size != strlen(s)) {
-            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
-            return NULL;
-        }
-
-        wchar = _Py_char2wchar(s, &len);
-        if (wchar == NULL)
-            return PyErr_NoMemory();
-
-        unicode = PyUnicode_FromWideChar(wchar, len);
-        PyMem_Free(wchar);
-        return unicode;
+        return PyUnicode_DecodeLocaleAndSize(s, size, 1);
      }
  #endif
  }
diff --git a/Python/fileutils.c b/Python/fileutils.c

index 0afa415d59b3f5cc3f540eb10e990a9ea2d224f2..0aad2200fb1fe64fc06ac3079b3556e456b3f889 100644 (file)
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -16,7 +16,9 @@
     Return a pointer to a newly allocated wide character string (use
     PyMem_Free() to free the memory) and write the number of written wide
     characters excluding the null character into *size if size is not NULL, or
-   NULL on error (conversion or memory allocation error).
+   NULL on error (decoding or memory allocation error). If size is not NULL,
+   *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
+   error.
  
     Conversion errors should never happen, unless there is a bug in the C
     library. */
@@ -82,8 +84,9 @@ _Py_char2wchar(const char* arg, size_t *size)
                 since we provide everything that we have -
                 unless there is a bug in the C library, or I
                 misunderstood how mbrtowc works. */
-            fprintf(stderr, "unexpected mbrtowc result -2\n");
              PyMem_Free(res);
+            if (size != NULL)
+                *size = (size_t)-2;
              return NULL;
          }
          if (converted == (size_t)-1) {
@@ -112,7 +115,8 @@ _Py_char2wchar(const char* arg, size_t *size)
         is ASCII (i.e. escape all bytes > 128. This will still roundtrip
         correctly in the locale's charset, which must be an ASCII superset. */
      res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
-    if (!res) goto oom;
+    if (!res)
+        goto oom;
      in = (unsigned char*)arg;
      out = res;
      while(*in)
@@ -126,7 +130,8 @@ _Py_char2wchar(const char* arg, size_t *size)
          *size = out - res;
      return res;
  oom:
-    fprintf(stderr, "out of memory\n");
+    if (size != NULL)
+        *size = (size_t)-1;
      return NULL;
  }
  
@@ -137,10 +142,10 @@ oom:
     This function is the reverse of _Py_char2wchar().
  
     Return a pointer to a newly allocated byte string (use PyMem_Free() to free
-   the memory), or NULL on conversion or memory allocation error.
+   the memory), or NULL on encoding or memory allocation error.
  
     If error_pos is not NULL: *error_pos is the index of the invalid character
-   on conversion error, or (size_t)-1 otherwise. */
+   on encoding error, or (size_t)-1 otherwise. */
  char*
  _Py_wchar2char(const wchar_t *text, size_t *error_pos)
  {
@@ -328,7 +333,7 @@ _Py_fopen(PyObject *path, const char *mode)
  #ifdef HAVE_READLINK
  
  /* Read value of symbolic link. Encode the path to the locale encoding, decode
-   the result from the locale encoding. */
+   the result from the locale encoding. Return -1 on error. */
  
  int
  _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
@@ -372,7 +377,8 @@ _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
  #ifdef HAVE_REALPATH
  
  /* Return the canonicalized absolute pathname. Encode path to the locale
-   encoding, decode the result from the locale encoding. */
+   encoding, decode the result from the locale encoding.
+   Return NULL on error. */
  
  wchar_t*
  _Py_wrealpath(const wchar_t *path,
@@ -410,7 +416,8 @@ _Py_wrealpath(const wchar_t *path,
  #endif
  
  /* Get the current directory. size is the buffer size in wide characters
-   including the null character. Decode the path from the locale encoding. */
+   including the null character. Decode the path from the locale encoding.
+   Return NULL on error. */
  
  wchar_t*
  _Py_wgetcwd(wchar_t *buf, size_t size)
author	Victor Stinner <victor.stinner@haypocalc.com>
	Fri, 16 Dec 2011 22:56:01 +0000 (23:56 +0100)
committer	Victor Stinner <victor.stinner@haypocalc.com>
	Fri, 16 Dec 2011 22:56:01 +0000 (23:56 +0100)
Doc/c-api/unicode.rst		patch \| blob \| history
Include/unicodeobject.h		patch \| blob \| history
Modules/_localemodule.c		patch \| blob \| history
Modules/main.c		patch \| blob \| history
Modules/timemodule.c		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history
Python/fileutils.c		patch \| blob \| history