Issue #8633: Support for POSIX.1-2008 binary pax headers.

author Lars Gustäbel <lars@gustaebel.de>

Mon, 17 May 2010 18:02:50 +0000 (18:02 +0000)

committer Lars Gustäbel <lars@gustaebel.de>

Mon, 17 May 2010 18:02:50 +0000 (18:02 +0000)
author Lars Gustäbel <lars@gustaebel.de>
Mon, 17 May 2010 18:02:50 +0000 (18:02 +0000)
committer Lars Gustäbel <lars@gustaebel.de>
Mon, 17 May 2010 18:02:50 +0000 (18:02 +0000)
diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst

index 8f68c42e08d187fbc11ade527a915b9d04238b43..c2a9143702c8f5eb91b6c72af02a3637bba42cc6 100644 (file)
--- a/Doc/library/tarfile.rst
+++ b/Doc/library/tarfile.rst
@@ -711,6 +711,8 @@ converted. Possible values are listed in section :ref:`codec-base-classes`.
  The default scheme is ``'surrogateescape'`` which Python also uses for its
  file system calls, see :ref:`os-filenames`.
  
-In case of writing :const:`PAX_FORMAT` archives, *encoding* is ignored because
-non-ASCII metadata is stored using *UTF-8*. Storing surrogate characters is not
-possible and will raise a :exc:`UnicodeEncodeError`.
+In case of :const:`PAX_FORMAT` archives, *encoding* is generally not needed
+because all the metadata is stored using *UTF-8*. *encoding* is only used in
+the rare cases when binary pax headers are decoded or when strings with
+surrogate characters are stored.
+
diff --git a/Lib/tarfile.py b/Lib/tarfile.py

index 81b13a678f7df9da5722f7e42fd08082fd25fcfd..e9480c82d5e55346c14e7a62b7b58d29a1ac4bee 100644 (file)
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -118,6 +118,9 @@ GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
  PAX_FIELDS = ("path", "linkpath", "size", "mtime",
                "uid", "gid", "uname", "gname")
  
+# Fields from a pax header that are affected by hdrcharset.
+PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
+
  # Fields in a pax header that are numbers, all other fields
  # are treated as strings.
  PAX_NUMBER_FIELDS = {
@@ -988,7 +991,7 @@ class TarInfo(object):
          elif format == GNU_FORMAT:
              return self.create_gnu_header(info, encoding, errors)
          elif format == PAX_FORMAT:
-            return self.create_pax_header(info)
+            return self.create_pax_header(info, encoding)
          else:
              raise ValueError("invalid format")
  
@@ -1019,7 +1022,7 @@ class TarInfo(object):
  
          return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
  
-    def create_pax_header(self, info):
+    def create_pax_header(self, info, encoding):
          """Return the object as a ustar header block. If it cannot be
             represented this way, prepend a pax extended header sequence
             with supplement information.
@@ -1062,7 +1065,7 @@ class TarInfo(object):
  
          # Create a pax extended header if necessary.
          if pax_headers:
-            buf = self._create_pax_generic_header(pax_headers, XHDTYPE)
+            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
          else:
              buf = b""
  
@@ -1072,7 +1075,7 @@ class TarInfo(object):
      def create_pax_global_header(cls, pax_headers):
          """Return the object as a pax global header block sequence.
          """
-        return cls._create_pax_generic_header(pax_headers, XGLTYPE)
+        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf8")
  
      def _posix_split_name(self, name):
          """Split a name longer than 100 chars into a prefix
@@ -1145,15 +1148,35 @@ class TarInfo(object):
                  cls._create_payload(name)
  
      @classmethod
-    def _create_pax_generic_header(cls, pax_headers, type):
-        """Return a POSIX.1-2001 extended or global header sequence
+    def _create_pax_generic_header(cls, pax_headers, type, encoding):
+        """Return a POSIX.1-2008 extended or global header sequence
             that contains a list of keyword, value pairs. The values
             must be strings.
          """
+        # Check if one of the fields contains surrogate characters and thereby
+        # forces hdrcharset=BINARY, see _proc_pax() for more information.
+        binary = False
+        for keyword, value in pax_headers.items():
+            try:
+                value.encode("utf8", "strict")
+            except UnicodeEncodeError:
+                binary = True
+                break
+
          records = b""
+        if binary:
+            # Put the hdrcharset field at the beginning of the header.
+            records += b"21 hdrcharset=BINARY\n"
+
          for keyword, value in pax_headers.items():
              keyword = keyword.encode("utf8")
-            value = value.encode("utf8")
+            if binary:
+                # Try to restore the original byte representation of `value'.
+                # Needless to say, the encoding must match the string.
+                value = value.encode(encoding, "surrogateescape")
+            else:
+                value = value.encode("utf8")
+
              l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
              n = p = 0
              while True:
@@ -1354,7 +1377,7 @@ class TarInfo(object):
  
      def _proc_pax(self, tarfile):
          """Process an extended or global header as described in
-           POSIX.1-2001.
+           POSIX.1-2008.
          """
          # Read the header information.
          buf = tarfile.fileobj.read(self._block(self.size))
@@ -1367,6 +1390,24 @@ class TarInfo(object):
          else:
              pax_headers = tarfile.pax_headers.copy()
  
+        # Check if the pax header contains a hdrcharset field. This tells us
+        # the encoding of the path, linkpath, uname and gname fields. Normally,
+        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
+        # implementations are allowed to store them as raw binary strings if
+        # the translation to UTF-8 fails.
+        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
+        if match is not None:
+            pax_headers["hdrcharset"] = match.group(1).decode("utf8")
+
+        # For the time being, we don't care about anything other than "BINARY".
+        # The only other value that is currently allowed by the standard is
+        # "ISO-IR 10646 2000 UTF-8", in other words UTF-8.
+        hdrcharset = pax_headers.get("hdrcharset")
+        if hdrcharset == "BINARY":
+            encoding = tarfile.encoding
+        else:
+            encoding = "utf8"
+
          # Parse pax header information. A record looks like that:
          # "%d %s=%s\n" % (length, keyword, value). length is the size
          # of the complete record including the length field itself and
@@ -1382,8 +1423,21 @@ class TarInfo(object):
              length = int(length)
              value = buf[match.end(2) + 1:match.start(1) + length - 1]
  
-            keyword = keyword.decode("utf8")
-            value = value.decode("utf8")
+            # Normally, we could just use "utf8" as the encoding and "strict"
+            # as the error handler, but we had better not take the risk. For
+            # example, GNU tar <= 1.23 is known to store filenames it cannot
+            # translate to UTF-8 as raw strings (unfortunately without a
+            # hdrcharset=BINARY header).
+            # We first try the strict standard encoding, and if that fails we
+            # fall back on the user's encoding and error handler.
+            keyword = self._decode_pax_field(keyword, "utf8", "utf8",
+                    tarfile.errors)
+            if keyword in PAX_NAME_FIELDS:
+                value = self._decode_pax_field(value, encoding, tarfile.encoding,
+                        tarfile.errors)
+            else:
+                value = self._decode_pax_field(value, "utf8", "utf8",
+                        tarfile.errors)
  
              pax_headers[keyword] = value
              pos += length
@@ -1431,6 +1485,14 @@ class TarInfo(object):
  
          self.pax_headers = pax_headers.copy()
  
+    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
+        """Decode a single field from a pax record.
+        """
+        try:
+            return value.decode(encoding, "strict")
+        except UnicodeDecodeError:
+            return value.decode(fallback_encoding, fallback_errors)
+
      def _block(self, count):
          """Round up a byte count by BLOCKSIZE and return it,
             e.g. _block(834) => 1024.
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py

index 2db18fe8b6c615ec55ec2f01bcd929a352b425ce..c0741ee5e5fbee6c72d8a29a8ad340852f272525 100644 (file)
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -1126,11 +1126,32 @@ class GNUUnicodeTest(UstarUnicodeTest):
  
      format = tarfile.GNU_FORMAT
  
+    def test_bad_pax_header(self):
+        # Test for issue #8633. GNU tar <= 1.23 creates raw binary fields
+        # without a hdrcharset=BINARY header.
+        for encoding, name in (("utf8", "pax/bad-pax-\udce4\udcf6\udcfc"),
+                ("iso8859-1", "pax/bad-pax-\xe4\xf6\xfc"),):
+            with tarfile.open(tarname, encoding=encoding, errors="surrogateescape") as tar:
+                try:
+                    t = tar.getmember(name)
+                except KeyError:
+                    self.fail("unable to read bad GNU tar pax header")
+
  
  class PAXUnicodeTest(UstarUnicodeTest):
  
      format = tarfile.PAX_FORMAT
  
+    def test_binary_header(self):
+        # Test a POSIX.1-2008 compatible header with a hdrcharset=BINARY field.
+        for encoding, name in (("utf8", "pax/hdrcharset-\udce4\udcf6\udcfc"),
+                ("iso8859-1", "pax/hdrcharset-\xe4\xf6\xfc"),):
+            with tarfile.open(tarname, encoding=encoding, errors="surrogateescape") as tar:
+                try:
+                    t = tar.getmember(name)
+                except KeyError:
+                    self.fail("unable to read POSIX.1-2008 binary header")
+
  
  class AppendTest(unittest.TestCase):
      # Test append mode (cp. patch #1652681).
diff --git a/Lib/test/testtar.tar b/Lib/test/testtar.tar

index b5bb46b16e682eb196c2eb4e7b9d3f69db916928..c3022ed196e37717fea75efd90890b8f96968588 100644 (file)

Binary files a/Lib/test/testtar.tar and b/Lib/test/testtar.tar differ
diff --git a/Misc/NEWS b/Misc/NEWS

index 1df122b0aa19f31b645d41bf37910064815dcc1c..ee19f3618580ac1c461f96c09e23d76a42c44955 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -366,6 +366,9 @@ C-API
  Library
  -------
  
+- Issue #8633: tarfile is now able to read and write archives with "raw" binary
+  pax headers as described in POSIX.1-2008.
+
  - Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes,
    unquote, unquote_to_bytes.
author	Lars Gustäbel <lars@gustaebel.de>
	Mon, 17 May 2010 18:02:50 +0000 (18:02 +0000)
committer	Lars Gustäbel <lars@gustaebel.de>
	Mon, 17 May 2010 18:02:50 +0000 (18:02 +0000)
Doc/library/tarfile.rst		patch \| blob \| history
Lib/tarfile.py		patch \| blob \| history
Lib/test/test_tarfile.py		patch \| blob \| history
Lib/test/testtar.tar		patch \| blob \| history
Misc/NEWS		patch \| blob \| history