Issue #850997: mbcs encoding (Windows only) handles errors argument: strict

author Victor Stinner <victor.stinner@haypocalc.com>

Wed, 16 Jun 2010 23:33:54 +0000 (23:33 +0000)

committer Victor Stinner <victor.stinner@haypocalc.com>

Wed, 16 Jun 2010 23:33:54 +0000 (23:33 +0000)
author Victor Stinner <victor.stinner@haypocalc.com>
Wed, 16 Jun 2010 23:33:54 +0000 (23:33 +0000)
committer Victor Stinner <victor.stinner@haypocalc.com>
Wed, 16 Jun 2010 23:33:54 +0000 (23:33 +0000)
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst

index 13e86a24fe25687b649636f0da243dcb1f0abfdb..853cc785f03aceff9a7932a734fced0dcf8745ef 100644 (file)
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1223,6 +1223,23 @@ functions can be used directly if desired.
     Convert a label to Unicode, as specified in :rfc:`3490`.
  
  
+:mod:`encodings.mbcs` --- Windows ANSI codepage
+-----------------------------------------------
+
+.. module:: encodings.mbcs
+   :synopsis: Windows ANSI codepage
+
+Encode or decode the operand according to the ANSI codepage (CP_ACP). This
+codec supports only the ``'strict'`` and ``'replace'`` error handlers for
+encoding, and only ``'strict'`` and ``'ignore'`` for decoding.
+
+Availability: Windows only.
+
+.. versionchanged:: 3.2
+   Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
+   to encode, and ``'ignore'`` to decode.
+
+
  :mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
  -------------------------------------------------------------
  
diff --git a/Lib/ctypes/__init__.py b/Lib/ctypes/__init__.py

index 8782db9c16febe25e537b603980940b24bea0ee8..ce1d779b42066461a9d90d7c6928044a62fe2eb5 100644 (file)
--- a/Lib/ctypes/__init__.py
+++ b/Lib/ctypes/__init__.py
@@ -265,7 +265,7 @@ except ImportError:
      pass
  else:
      if _os.name in ("nt", "ce"):
-        set_conversion_mode("mbcs", "ignore")
+        set_conversion_mode("mbcs", "strict")
      else:
          set_conversion_mode("ascii", "strict")
  
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 911d58f8f674d8763e719eb090044e0917fbba99..521cbce35b480fcd8f686d278bc03c05bc4e05a3 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1358,11 +1358,6 @@ broken_incremental_coders = broken_unicode_with_streams + [
      "idna",
  ]
  
-# The following encodings only support "strict" mode
-only_strict_mode = [
-    "idna",
-]
-
  class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
      def test_basics(self):
          s = "abc123" # all codecs should be able to encode these
@@ -1437,7 +1432,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
                      result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                      self.assertEqual(result, "")
  
-                if encoding not in only_strict_mode:
+                if encoding not in ("idna", "mbcs"):
                      # check incremental decoder/encoder with errors argument
                      try:
                          encoder = codecs.getincrementalencoder(encoding)("ignore")
diff --git a/Misc/NEWS b/Misc/NEWS

index 7817276b8ae7718b466d1b8e492f3ced76684ce5..e56711ccb60c724f5440b3d8b5cd93af6990f294 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
  Core and Builtins
  -----------------
  
+- Issue #850997: mbcs encoding (Windows only) now honors the errors argument:
+  strict mode raises Unicode errors. The encoder supports only the "strict"
+  and "replace" error handlers; the decoder supports only the "strict" and
+  "ignore" error handlers.
+
  - Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
    formats if the string contains a null byte/character. Write unit tests for
    string formats.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 4153c25f53d657a123b41ccc10dec4e035519b30..83e036099f6a09cf2c60f7c510020d735b6c4715 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1767,6 +1767,33 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
      return 0;
  }
  
+/* Create or adjust a UnicodeDecodeError.
+
+   If *exceptionObject is NULL, a new exception is created with
+   PyUnicodeDecodeError_Create() from encoding, input, length, startpos,
+   endpos and reason. Otherwise only the start, end and reason of the
+   existing exception are updated; encoding, input and length are not
+   used in that case.
+
+   There is no return value. If one of the updates fails, the existing
+   exception is released and *exceptionObject is reset to NULL, so callers
+   are expected to test *exceptionObject for NULL afterwards. */
+static void
+make_decode_exception(PyObject **exceptionObject,
+                      const char *encoding,
+                      const char *input, Py_ssize_t length,
+                      Py_ssize_t startpos, Py_ssize_t endpos,
+                      const char *reason)
+{
+    if (*exceptionObject == NULL) {
+        *exceptionObject = PyUnicodeDecodeError_Create(
+            encoding, input, length, startpos, endpos, reason);
+    }
+    else {
+        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
+            goto onError;
+        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
+            goto onError;
+        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
+            goto onError;
+    }
+    return;
+
+onError:
+    Py_DECREF(*exceptionObject);  /* drop the exception that could not be updated */
+    *exceptionObject = NULL;      /* NULL is the failure signal seen by the caller */
+}
+
  /* error handling callback helper:
     build arguments, call the callback and check the arguments,
     if no exception occurred, copy the replacement to the output
@@ -1800,20 +1827,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
              goto onError;
      }
  
-    if (*exceptionObject == NULL) {
-        *exceptionObject = PyUnicodeDecodeError_Create(
-            encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
-        if (*exceptionObject == NULL)
-            goto onError;
-    }
-    else {
-        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
-            goto onError;
-        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
-            goto onError;
-        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
-            goto onError;
-    }
+    make_decode_exception(exceptionObject,
+        encoding,
+        *input, *inend - *input,
+        *startinpos, *endinpos,
+        reason);
+    if (*exceptionObject == NULL)
+        goto onError;
  
      restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
      if (restuple == NULL)
@@ -4552,32 +4572,46 @@ static int is_dbcs_lead_byte(const char *s, int offset)
  static int decode_mbcs(PyUnicodeObject **v,
                         const char *s, /* MBCS string */
                         int size, /* sizeof MBCS string */
-                       int final)
+                       int final,
+                       const char *errors)
  {
      Py_UNICODE *p;
-    Py_ssize_t n = 0;
-    int usize = 0;
+    Py_ssize_t n;
+    DWORD usize;
+    DWORD flags;
  
      assert(size >= 0);
  
+    /* check and handle 'errors' arg */
+    if (errors==NULL || strcmp(errors, "strict")==0)
+        flags = MB_ERR_INVALID_CHARS;
+    else if (strcmp(errors, "ignore")==0)
+        flags = 0;
+    else {
+        PyErr_Format(PyExc_ValueError,
+                     "mbcs encoding does not support errors='%s'",
+                     errors);
+        return -1;
+    }
+
      /* Skip trailing lead-byte unless 'final' is set */
      if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
          --size;
  
      /* First get the size of the result */
      if (size > 0) {
-        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
-        if (usize == 0) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
-            return -1;
-        }
-    }
+        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
+        if (usize==0)
+            goto mbcs_decode_error;
+    } else
+        usize = 0;
  
      if (*v == NULL) {
          /* Create unicode object */
          *v = _PyUnicode_New(usize);
          if (*v == NULL)
              return -1;
+        n = 0;
      }
      else {
          /* Extend unicode object */
@@ -4587,15 +4621,35 @@ static int decode_mbcs(PyUnicodeObject **v,
      }
  
      /* Do the conversion */
-    if (size > 0) {
+    if (usize > 0) {
          p = PyUnicode_AS_UNICODE(*v) + n;
-        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
-            return -1;
+        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
+            goto mbcs_decode_error;
          }
      }
-
      return size;
+
+mbcs_decode_error:
+    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
+       we raise a UnicodeDecodeError - else it is a 'generic'
+       windows error
+     */
+    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
+        /* Ideally, we should get reason from FormatMessage - this
+           is the Windows 2000 English version of the message
+        */
+        PyObject *exc = NULL;
+        const char *reason = "No mapping for the Unicode character exists "
+                             "in the target multi-byte code page.";
+        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_DECREF(exc);
+        }
+    } else {
+        PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    }
+    return -1;
  }
  
  PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
@@ -4612,10 +4666,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
  #ifdef NEED_RETRY
    retry:
      if (size > INT_MAX)
-        done = decode_mbcs(&v, s, INT_MAX, 0);
+        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
      else
  #endif
-        done = decode_mbcs(&v, s, (int)size, !consumed);
+        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
  
      if (done < 0) {
          Py_XDECREF(v);
@@ -4649,20 +4703,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s,
   */
  static int encode_mbcs(PyObject **repr,
                         const Py_UNICODE *p, /* unicode */
-                       int size) /* size of unicode */
+                       int size, /* size of unicode */
+                       const char* errors)
  {
-    int mbcssize = 0;
-    Py_ssize_t n = 0;
+    BOOL usedDefaultChar = FALSE;
+    BOOL *pusedDefaultChar;
+    int mbcssize;
+    Py_ssize_t n;
+    PyObject *exc = NULL;
+    DWORD flags;
  
      assert(size >= 0);
  
+    /* check and handle 'errors' arg */
+    if (errors==NULL || strcmp(errors, "strict")==0) {
+        flags = WC_NO_BEST_FIT_CHARS;
+        pusedDefaultChar = &usedDefaultChar;
+    } else if (strcmp(errors, "replace")==0) {
+        flags = 0;
+        pusedDefaultChar = NULL;
+    } else {
+         PyErr_Format(PyExc_ValueError,
+                      "mbcs encoding does not support errors='%s'",
+                      errors);
+         return -1;
+    }
+
      /* First get the size of the result */
      if (size > 0) {
-        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
+                                       NULL, pusedDefaultChar);
          if (mbcssize == 0) {
              PyErr_SetFromWindowsErrWithFilename(0, NULL);
              return -1;
          }
+        /* If we used a default char, then we failed! */
+        if (pusedDefaultChar && *pusedDefaultChar)
+            goto mbcs_encode_error;
+    } else {
+        mbcssize = 0;
      }
  
      if (*repr == NULL) {
@@ -4670,6 +4749,7 @@ static int encode_mbcs(PyObject **repr,
          *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
          if (*repr == NULL)
              return -1;
+        n = 0;
      }
      else {
          /* Extend string object */
@@ -4681,13 +4761,20 @@ static int encode_mbcs(PyObject **repr,
      /* Do the conversion */
      if (size > 0) {
          char *s = PyBytes_AS_STRING(*repr) + n;
-        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
+                                     NULL, pusedDefaultChar)) {
              PyErr_SetFromWindowsErrWithFilename(0, NULL);
              return -1;
          }
+        if (pusedDefaultChar && *pusedDefaultChar)
+            goto mbcs_encode_error;
      }
-
      return 0;
+
+mbcs_encode_error:
+    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
+    Py_XDECREF(exc);
+    return -1;
  }
  
  PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
@@ -4700,10 +4787,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
  #ifdef NEED_RETRY
    retry:
      if (size > INT_MAX)
-        ret = encode_mbcs(&repr, p, INT_MAX);
+        ret = encode_mbcs(&repr, p, INT_MAX, errors);
      else
  #endif
-        ret = encode_mbcs(&repr, p, (int)size);
+        ret = encode_mbcs(&repr, p, (int)size, errors);
  
      if (ret < 0) {
          Py_XDECREF(repr);
author	Victor Stinner <victor.stinner@haypocalc.com>
	Wed, 16 Jun 2010 23:33:54 +0000 (23:33 +0000)
committer	Victor Stinner <victor.stinner@haypocalc.com>
	Wed, 16 Jun 2010 23:33:54 +0000 (23:33 +0000)
Doc/library/codecs.rst		patch \| blob \| history
Lib/ctypes/__init__.py		patch \| blob \| history
Lib/test/test_codecs.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history