Add an errors parameter to open() and TextIOWrapper() to specify error handling.

author Guido van Rossum <guido@python.org>

Mon, 3 Dec 2007 22:54:21 +0000 (22:54 +0000)

committer Guido van Rossum <guido@python.org>

Mon, 3 Dec 2007 22:54:21 +0000 (22:54 +0000)
author Guido van Rossum <guido@python.org>
Mon, 3 Dec 2007 22:54:21 +0000 (22:54 +0000)
committer Guido van Rossum <guido@python.org>
Mon, 3 Dec 2007 22:54:21 +0000 (22:54 +0000)
diff --git a/Include/fileobject.h b/Include/fileobject.h

index 0f40089ef19624d0ee8be84fc010d4a48ee012be..00ec9befed68114b1f283aa83aec5013e7ecb331 100644 (file)
--- a/Include/fileobject.h
+++ b/Include/fileobject.h
@@ -9,7 +9,7 @@ extern "C" {
  #define PY_STDIOTEXTMODE "b"
  
  PyAPI_FUNC(PyObject *) PyFile_FromFd(int, char *, char *, int, char *, char *,
-                                    int);
+                                    char *, int);
  PyAPI_FUNC(PyObject *) PyFile_GetLine(PyObject *, int);
  PyAPI_FUNC(int) PyFile_WriteObject(PyObject *, PyObject *, int);
  PyAPI_FUNC(int) PyFile_WriteString(const char *, PyObject *);
diff --git a/Lib/io.py b/Lib/io.py

index ff039013254fce57df3b9ac435a1e6f53c3c86d8..a72e3fd7e3989253332802cf32100786284c7d76 100644 (file)
--- a/Lib/io.py
+++ b/Lib/io.py
@@ -49,8 +49,8 @@ class BlockingIOError(IOError):
          self.characters_written = characters_written
  
  
-def open(file, mode="r", buffering=None, encoding=None, newline=None,
-         closefd=True):
+def open(file, mode="r", buffering=None, encoding=None, errors=None,
+         newline=None, closefd=True):
      r"""Replacement for the built-in open function.
  
      Args:
@@ -61,6 +61,7 @@ def open(file, mode="r", buffering=None, encoding=None, newline=None,
                   can be: 0 = unbuffered, 1 = line buffered,
                   larger = fully buffered.
        encoding: optional string giving the text encoding.
+      errors: optional string giving the encoding error handling.
        newline: optional newlines specifier; must be None, '', '\n', '\r'
                 or '\r\n'; all other values are illegal.  It controls the
                 handling of line endings.  It works as follows:
@@ -99,7 +100,7 @@ def open(file, mode="r", buffering=None, encoding=None, newline=None,
        'U': universal newline mode (for backwards compatibility)
  
      Constraints:
-      - encoding must not be given when a binary mode is given
+      - encoding or errors must not be given when a binary mode is given
        - buffering must not be zero when a text mode is given
  
      Returns:
@@ -115,6 +116,8 @@ def open(file, mode="r", buffering=None, encoding=None, newline=None,
          raise TypeError("invalid buffering: %r" % buffering)
      if encoding is not None and not isinstance(encoding, str):
          raise TypeError("invalid encoding: %r" % encoding)
+    if errors is not None and not isinstance(errors, str):
+        raise TypeError("invalid errors: %r" % errors)
      modes = set(mode)
      if modes - set("arwb+tU") or len(mode) > len(modes):
          raise ValueError("invalid mode: %r" % mode)
@@ -136,6 +139,8 @@ def open(file, mode="r", buffering=None, encoding=None, newline=None,
          raise ValueError("must have exactly one of read/write/append mode")
      if binary and encoding is not None:
          raise ValueError("binary mode doesn't take an encoding argument")
+    if binary and errors is not None:
+        raise ValueError("binary mode doesn't take an errors argument")
      if binary and newline is not None:
          raise ValueError("binary mode doesn't take a newline argument")
      raw = FileIO(file,
@@ -177,7 +182,7 @@ def open(file, mode="r", buffering=None, encoding=None, newline=None,
          buffer.name = file
          buffer.mode = mode
          return buffer
-    text = TextIOWrapper(buffer, encoding, newline)
+    text = TextIOWrapper(buffer, encoding, errors, newline)
      text.name = file
      text.mode = mode
      return text
@@ -1128,7 +1133,7 @@ class TextIOWrapper(TextIOBase):
  
      _CHUNK_SIZE = 128
  
-    def __init__(self, buffer, encoding=None, newline=None):
+    def __init__(self, buffer, encoding=None, errors=None, newline=None):
          if newline not in (None, "", "\n", "\r", "\r\n"):
              raise ValueError("illegal newline value: %r" % (newline,))
          if encoding is None:
@@ -1148,8 +1153,15 @@ class TextIOWrapper(TextIOBase):
          if not isinstance(encoding, str):
              raise ValueError("invalid encoding: %r" % encoding)
  
+        if errors is None:
+            errors = "strict"
+        else:
+            if not isinstance(errors, str):
+                raise ValueError("invalid errors: %r" % errors)
+
          self.buffer = buffer
          self._encoding = encoding
+        self._errors = errors
          self._readuniversal = not newline
          self._readtranslate = newline is None
          self._readnl = newline
@@ -1164,6 +1176,10 @@ class TextIOWrapper(TextIOBase):
      def encoding(self):
          return self._encoding
  
+    @property
+    def errors(self):
+        return self._errors
+
      # A word about _snapshot.  This attribute is either None, or a
      # tuple (decoder_state, readahead, pending) where decoder_state is
      # the second (integer) item of the decoder state, readahead is the
@@ -1206,7 +1222,7 @@ class TextIOWrapper(TextIOBase):
          if haslf and self._writetranslate and self._writenl != "\n":
              s = s.replace("\n", self._writenl)
          # XXX What if we were just reading?
-        b = s.encode(self._encoding)
+        b = s.encode(self._encoding, self._errors)
          self.buffer.write(b)
          if haslf and self.isatty():
              self.flush()
@@ -1220,7 +1236,7 @@ class TextIOWrapper(TextIOBase):
          if make_decoder is None:
              raise IOError("Can't find an incremental decoder for encoding %s" %
                            self._encoding)
-        decoder = make_decoder()  # XXX: errors
+        decoder = make_decoder(self._errors)
          if self._readuniversal:
              decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
          self._decoder = decoder
@@ -1447,9 +1463,11 @@ class StringIO(TextIOWrapper):
  
      # XXX This is really slow, but fully functional
  
-    def __init__(self, initial_value="", encoding="utf-8", newline="\n"):
+    def __init__(self, initial_value="", encoding="utf-8",
+                 errors="strict", newline="\n"):
          super(StringIO, self).__init__(BytesIO(),
                                         encoding=encoding,
+                                       errors=errors,
                                         newline=newline)
          if initial_value:
              if not isinstance(initial_value, str):
@@ -1459,4 +1477,4 @@ class StringIO(TextIOWrapper):
  
      def getvalue(self):
          self.flush()
-        return self.buffer.getvalue().decode(self._encoding)
+        return self.buffer.getvalue().decode(self._encoding, self._errors)
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py

index 7ca3fbbd47528a84adc0af027ba084f2512e644e..36aaf14fc8cb1faddfea75ad68a07758aab279aa 100644 (file)
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -496,6 +496,46 @@ class TextIOWrapperTest(unittest.TestCase):
      def tearDown(self):
          test_support.unlink(test_support.TESTFN)
  
+    def testEncodingErrorsReading(self):
+        # (1) default
+        b = io.BytesIO(b"abc\n\xff\n")
+        t = io.TextIOWrapper(b, encoding="ascii")
+        self.assertRaises(UnicodeError, t.read)
+        # (2) explicit strict
+        b = io.BytesIO(b"abc\n\xff\n")
+        t = io.TextIOWrapper(b, encoding="ascii", errors="strict")
+        self.assertRaises(UnicodeError, t.read)
+        # (3) ignore
+        b = io.BytesIO(b"abc\n\xff\n")
+        t = io.TextIOWrapper(b, encoding="ascii", errors="ignore")
+        self.assertEquals(t.read(), "abc\n\n")
+        # (4) replace
+        b = io.BytesIO(b"abc\n\xff\n")
+        t = io.TextIOWrapper(b, encoding="ascii", errors="replace")
+        self.assertEquals(t.read(), "abc\n\ufffd\n")
+
+    def testEncodingErrorsWriting(self):
+        # (1) default
+        b = io.BytesIO()
+        t = io.TextIOWrapper(b, encoding="ascii")
+        self.assertRaises(UnicodeError, t.write, "\xff")
+        # (2) explicit strict
+        b = io.BytesIO()
+        t = io.TextIOWrapper(b, encoding="ascii", errors="strict")
+        self.assertRaises(UnicodeError, t.write, "\xff")
+        # (3) ignore
+        b = io.BytesIO()
+        t = io.TextIOWrapper(b, encoding="ascii", errors="ignore")
+        t.write("abc\xffdef\n")
+        t.flush()
+        self.assertEquals(b.getvalue(), b"abcdef\n")
+        # (4) replace
+        b = io.BytesIO()
+        t = io.TextIOWrapper(b, encoding="ascii", errors="replace")
+        t.write("abc\xffdef\n")
+        t.flush()
+        self.assertEquals(b.getvalue(), b"abc?def\n")
+
      def testNewlinesInput(self):
          testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
          normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
diff --git a/Objects/complexobject.c b/Objects/complexobject.c

index 458d0baf82f64eec553dc82bea1c9f23c6e937cd..de4641cc206d5e593ddbfa9f4a69fc498e07d5cb 100644 (file)
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -915,6 +915,7 @@ complex_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
                         return NULL;
                 }
                 cr.real = PyFloat_AsDouble(tmp);
+                cr.imag = 0.0; /* Shut up compiler warning */
                 Py_DECREF(tmp);
         }
         if (i == NULL) {
diff --git a/Objects/fileobject.c b/Objects/fileobject.c

index f740977858dc06a594a6348d5f52eba5f28279fc..9b3ff3e28daacb01ffe9faa3ece61bc0ddfa2ee2 100644 (file)
--- a/Objects/fileobject.c
+++ b/Objects/fileobject.c
@@ -27,15 +27,16 @@ extern "C" {
  
  PyObject *
  PyFile_FromFd(int fd, char *name, char *mode, int buffering, char *encoding,
-             char *newline, int closefd)
+             char *errors, char *newline, int closefd)
  {
         PyObject *io, *stream, *nameobj = NULL;
  
         io = PyImport_ImportModule("io");
         if (io == NULL)
                 return NULL;
-       stream = PyObject_CallMethod(io, "open", "isissi", fd, mode,
-                                    buffering, encoding, newline, closefd);
+       stream = PyObject_CallMethod(io, "open", "isisssi", fd, mode,
+                                    buffering, encoding, errors,
+                                    newline, closefd);
         Py_DECREF(io);
         if (stream == NULL)
                 return NULL;
diff --git a/Python/import.c b/Python/import.c

index 221c2dd2d230bca5f502c66361f17b4a1b27820a..b7a97521683cd127596a7d7ce2083a2c5cef35c1 100644 (file)
--- a/Python/import.c
+++ b/Python/import.c
@@ -2602,7 +2602,7 @@ call_find_module(char *name, PyObject *path)
                                    (char*)PyUnicode_GetDefaultEncoding();
                 }
                 fob = PyFile_FromFd(fd, pathname, fdp->mode, -1,
-                                   (char*)encoding, NULL, 1);
+                                   (char*)encoding, NULL, NULL, 1);
                 if (fob == NULL) {
                         close(fd);
                         PyMem_FREE(found_encoding);
diff --git a/Python/pythonrun.c b/Python/pythonrun.c

index f46b90e8ac846b8eec42c843dec48ab4612a520f..14fe7835ee2003db0d6043372aeb401e94893e71 100644 (file)
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -770,7 +770,7 @@ initstdio(void)
  #endif
         }
         else {
-               if (!(std = PyFile_FromFd(fd, "<stdin>", "r", -1, NULL, 
+               if (!(std = PyFile_FromFd(fd, "<stdin>", "r", -1, NULL, NULL,
                                           "\n", 0))) {
                         goto error;
                 }
@@ -790,7 +790,7 @@ initstdio(void)
  #endif
         }
         else {
-               if (!(std = PyFile_FromFd(fd, "<stdout>", "w", -1, NULL, 
+               if (!(std = PyFile_FromFd(fd, "<stdout>", "w", -1, NULL, NULL,
                                           "\n", 0))) {
                         goto error;
                 }
@@ -811,7 +811,7 @@ initstdio(void)
  #endif
         }
         else {
-               if (!(std = PyFile_FromFd(fd, "<stderr>", "w", -1, NULL, 
+               if (!(std = PyFile_FromFd(fd, "<stderr>", "w", -1, NULL, NULL,
                                           "\n", 0))) {
                         goto error;
                 }
author	Guido van Rossum <guido@python.org>
	Mon, 3 Dec 2007 22:54:21 +0000 (22:54 +0000)
committer	Guido van Rossum <guido@python.org>
	Mon, 3 Dec 2007 22:54:21 +0000 (22:54 +0000)
Include/fileobject.h		patch | blob | history
Lib/io.py		patch | blob | history
Lib/test/test_io.py		patch | blob | history
Objects/complexobject.c		patch | blob | history
Objects/fileobject.c		patch | blob | history
Python/import.c		patch | blob | history
Python/pythonrun.c		patch | blob | history