#23745: handle duplicate MIME parameter names in new parser.

author R David Murray <rdmurray@bitdance.com>

Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)

committer R David Murray <rdmurray@bitdance.com>

Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)
author R David Murray <rdmurray@bitdance.com>
Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)
committer R David Murray <rdmurray@bitdance.com>
Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 1806cac9053a1573a59b9be513583fd3ba16e551..a9bdf4458b62a5df64a81c71752bdf26bca74973 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -71,6 +71,7 @@ import re
  import urllib   # For urllib.parse.unquote
  from string import hexdigits
  from collections import OrderedDict
+from operator import itemgetter
  from email import _encoded_words as _ew
  from email import errors
  from email import utils
@@ -1098,15 +1099,34 @@ class MimeParameters(TokenList):
                  params[name] = []
              params[name].append((token.section_number, token))
          for name, parts in params.items():
-            parts = sorted(parts)
-            # XXX: there might be more recovery we could do here if, for
-            # example, this is really a case of a duplicate attribute name.
+            parts = sorted(parts, key=itemgetter(0))
+            first_param = parts[0][1]
+            charset = first_param.charset
+            # Our arbitrary error recovery is to ignore duplicate parameters,
+            # to use appearance order if there are duplicate rfc 2231 parts,
+            # and to ignore gaps.  This mimics the error recovery of get_param.
+            if not first_param.extended and len(parts) > 1:
+                if parts[1][0] == 0:
+                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
+                        'duplicate parameter name; duplicate(s) ignored'))
+                    parts = parts[:1]
+                # Else assume the *0* was missing...note that this is different
+                # from get_param, but we registered a defect for this earlier.
              value_parts = []
-            charset = parts[0][1].charset
-            for i, (section_number, param) in enumerate(parts):
+            i = 0
+            for section_number, param in parts:
                  if section_number != i:
-                    param.defects.append(errors.InvalidHeaderDefect(
-                        "inconsistent multipart parameter numbering"))
+                    # We could get fancier here and look for a complete
+                    # duplicate extended parameter and ignore the second one
+                    # seen.  But we're not doing that.  The old code didn't.
+                    if not param.extended:
+                        param.defects.append(errors.InvalidHeaderDefect(
+                            'duplicate parameter name; duplicate ignored'))
+                        continue
+                    else:
+                        param.defects.append(errors.InvalidHeaderDefect(
+                            "inconsistent RFC2231 parameter numbering"))
+                i += 1
                  value = param.param_value
                  if param.extended:
                      try:
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index 5404d1913f88fbf8c37f8c01a5e1fbe00847e105..d028f7440beac8147e76d7e7e4a56ab1840336a4 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -2456,6 +2456,115 @@ class TestParser(TestParserMixin, TestEmailBase):
              ";foo", ";foo", ";foo", [errors.InvalidHeaderDefect]*3
          )
  
+
+@parameterize
+class Test_parse_mime_parameters(TestParserMixin, TestEmailBase):
+
+    def mime_parameters_as_value(self,
+                                 value,
+                                 tl_str,
+                                 tl_value,
+                                 params,
+                                 defects):
+        mime_parameters = self._test_parse_x(parser.parse_mime_parameters,
+            value, tl_str, tl_value, defects)
+        self.assertEqual(mime_parameters.token_type, 'mime-parameters')
+        self.assertEqual(list(mime_parameters.params), params)
+
+
+    mime_parameters_params = {
+
+        'simple': (
+            'filename="abc.py"',
+            ' filename="abc.py"',
+            'filename=abc.py',
+            [('filename', 'abc.py')],
+            []),
+
+        'multiple_keys': (
+            'filename="abc.py"; xyz=abc',
+            ' filename="abc.py"; xyz="abc"',
+            'filename=abc.py; xyz=abc',
+            [('filename', 'abc.py'), ('xyz', 'abc')],
+            []),
+
+        'split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            [('filename', '201.tif')],
+            []),
+
+        # Note that it is undefined what we should do for error recovery when
+        # there are duplicate parameter names or duplicate parts in a split
+        # part.  We choose to ignore all duplicate parameters after the first
+        # and to take duplicate or missing rfc 2231 parts in appearance order.
+        # This is backward compatible with get_param's behavior, but the
+        # decisions are arbitrary.
+
+        'duplicate_key': (
+            'filename=abc.gif; filename=def.tiff',
+            ' filename="abc.gif"',
+            "filename=abc.gif; filename=def.tiff",
+            [('filename', 'abc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_key_with_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename=abc.gif",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename=abc.gif",
+            [('filename', '201.tif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_key_with_split_value_other_order': (
+            "filename=abc.gif; "
+                " filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            ' filename="abc.gif"',
+            "filename=abc.gif;"
+                " filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            [('filename', 'abc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_in_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename*1*=abc.gif",
+            ' filename="201.tifabc.gif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename*1*=abc.gif",
+            [('filename', '201.tifabc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'missing_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;",
+            [('filename', '201.tif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_and_missing_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;"
+                " filename*3*=abc.gif",
+            ' filename="201.tifabc.gif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;"
+                " filename*3*=abc.gif",
+            [('filename', '201.tifabc.gif')],
+            [errors.InvalidHeaderDefect]*2),
+
+        # Here we depart from get_param and assume the *0* was missing.
+        'duplicate_with_broken_split_value': (
+            "filename=abc.gif; "
+                " filename*2*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66",
+            ' filename="abc.gif201.tif"',
+            "filename=abc.gif;"
+                " filename*2*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66",
+            [('filename', 'abc.gif201.tif')],
+            # Defects are apparent missing *0*, and two 'out of sequence'.
+            [errors.InvalidHeaderDefect]*3),
+
+    }
+
  @parameterize
  class Test_parse_mime_version(TestParserMixin, TestEmailBase):
  
diff --git a/Misc/NEWS b/Misc/NEWS

index b80cbe25f94d8e57f2997a37cef9ee72330ee67b..ebac8d52f65380e62accbace5bbe47fd777096fe 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -21,6 +21,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #23745: The new email header parser now handles duplicate MIME
+  parameter names without error, similar to how get_param behaves.
+
  - Issue #23792: Ignore KeyboardInterrupt when the pydoc pager is active.
    This mimics the behavior of the standard unix pagers, and prevents
    pipepager from shutting down while the pager itself is still running.
author	R David Murray <rdmurray@bitdance.com>
	Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)
committer	R David Murray <rdmurray@bitdance.com>
	Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)
Lib/email/_header_value_parser.py		patch \| blob \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history