Issue #8390: tarfile uses surrogateescape as the default error handler

author Victor Stinner <victor.stinner@haypocalc.com>

Wed, 5 May 2010 21:43:57 +0000 (21:43 +0000)

committer Victor Stinner <victor.stinner@haypocalc.com>

Wed, 5 May 2010 21:43:57 +0000 (21:43 +0000)
author Victor Stinner <victor.stinner@haypocalc.com>
Wed, 5 May 2010 21:43:57 +0000 (21:43 +0000)
committer Victor Stinner <victor.stinner@haypocalc.com>
Wed, 5 May 2010 21:43:57 +0000 (21:43 +0000)
diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst

index 50a5148606e6b33149614e805e9c6932fc42afa0..8f68c42e08d187fbc11ade527a915b9d04238b43 100644 (file)
--- a/Doc/library/tarfile.rst
+++ b/Doc/library/tarfile.rst
@@ -218,7 +218,7 @@ be finalized; only the internally used file object will be closed. See the
  .. versionadded:: 3.2
     Added support for the context manager protocol.
  
-.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors=None, pax_headers=None, debug=0, errorlevel=0)
+.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=0)
  
     All following arguments are optional and can be accessed as instance attributes
     as well.
@@ -267,6 +267,9 @@ be finalized; only the internally used file object will be closed. See the
     to be handled. The default settings will work for most users.
     See section :ref:`tar-unicode` for in-depth information.
  
+   .. versionchanged:: 3.2
+      Use ``'surrogateescape'`` as the default for the *errors* argument.
+
     The *pax_headers* argument is an optional dictionary of strings which
     will be added as a pax global header if *format* is :const:`PAX_FORMAT`.
  
@@ -449,11 +452,14 @@ It does *not* contain the file's data itself.
     a :class:`TarInfo` object.
  
  
-.. method:: TarInfo.tobuf(format=DEFAULT_FORMAT, encoding=ENCODING, errors='strict')
+.. method:: TarInfo.tobuf(format=DEFAULT_FORMAT, encoding=ENCODING, errors='surrogateescape')
  
     Create a string buffer from a :class:`TarInfo` object. For information on the
     arguments see the constructor of the :class:`TarFile` class.
  
+   .. versionchanged:: 3.2
+      Use ``'surrogateescape'`` as the default for the *errors* argument.
+
  
  A ``TarInfo`` object has the following public data attributes:
  
@@ -701,11 +707,10 @@ metadata must be either decoded or encoded. If *encoding* is not set
  appropriately, this conversion may fail.
  
  The *errors* argument defines how characters are treated that cannot be
-converted. Possible values are listed in section :ref:`codec-base-classes`. In
-read mode the default scheme is ``'replace'``. This avoids unexpected
-:exc:`UnicodeError` exceptions and guarantees that an archive can always be
-read. In write mode the default value for *errors* is ``'strict'``.  This
-ensures that name information is not altered unnoticed.
+converted. Possible values are listed in section :ref:`codec-base-classes`.
+The default scheme is ``'surrogateescape'`` which Python also uses for its
+file system calls, see :ref:`os-filenames`.
  
  In case of writing :const:`PAX_FORMAT` archives, *encoding* is ignored because
-non-ASCII metadata is stored using *UTF-8*.
+non-ASCII metadata is stored using *UTF-8*. Storing surrogate characters is not
+possible and will raise a :exc:`UnicodeEncodeError`.
diff --git a/Lib/tarfile.py b/Lib/tarfile.py

index a888d692dea230c8aab82671c36c7240fdef8d88..81b13a678f7df9da5722f7e42fd08082fd25fcfd 100644 (file)
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -978,7 +978,7 @@ class TarInfo(object):
  
          return info
  
-    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
+    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
          """Return a tar header as a string of 512 byte blocks.
          """
          info = self.get_info()
@@ -1490,7 +1490,7 @@ class TarFile(object):
  
      def __init__(self, name=None, mode="r", fileobj=None, format=None,
              tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
-            errors=None, pax_headers=None, debug=None, errorlevel=None):
+            errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
          """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
             read from an existing archive, 'a' to append data to an existing
             file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1531,13 +1531,7 @@ class TarFile(object):
              self.ignore_zeros = ignore_zeros
          if encoding is not None:
              self.encoding = encoding
-
-        if errors is not None:
-            self.errors = errors
-        elif mode == "r":
-            self.errors = "replace"
-        else:
-            self.errors = "strict"
+        self.errors = errors
  
          if pax_headers is not None and self.format == PAX_FORMAT:
              self.pax_headers = pax_headers
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py

index 43527ef32bfa50147b40e5c0f96f3022d7548d79..2db18fe8b6c615ec55ec2f01bcd929a352b425ce 100644 (file)
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -1118,8 +1118,8 @@ class UstarUnicodeTest(unittest.TestCase):
          if self.format != tarfile.PAX_FORMAT:
              tar = tarfile.open(tmpname, encoding="ascii")
              t = tar.getmember("foo")
-            self.assertEqual(t.uname, "\ufffd\ufffd\ufffd")
-            self.assertEqual(t.gname, "\ufffd\ufffd\ufffd")
+            self.assertEqual(t.uname, "\udce4\udcf6\udcfc")
+            self.assertEqual(t.gname, "\udce4\udcf6\udcfc")
  
  
  class GNUUnicodeTest(UstarUnicodeTest):
diff --git a/Misc/NEWS b/Misc/NEWS

index f3d27e10bad4514100c0e346de7b22f4bdbffa99..2b76def199d82971df2e718e3e06d284472590dd 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -348,6 +348,9 @@ C-API
  Library
  -------
  
+- Issue #8390: tarfile uses surrogateescape as the default error handler
+  (instead of replace in read mode or strict in write mode)
+
  - Issue #7755: Use an unencumbered audio file for tests.
  
  - Issue #8621: uuid.uuid4() returned the same sequence of values in the
author	Victor Stinner <victor.stinner@haypocalc.com>
	Wed, 5 May 2010 21:43:57 +0000 (21:43 +0000)
committer	Victor Stinner <victor.stinner@haypocalc.com>
	Wed, 5 May 2010 21:43:57 +0000 (21:43 +0000)
Doc/library/tarfile.rst		patch \| blob \| history
Lib/tarfile.py		patch \| blob \| history
Lib/test/test_tarfile.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history