#23745: handle duplicate MIME parameter names in new parser.
author R David Murray <rdmurray@bitdance.com>
Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)
committer R David Murray <rdmurray@bitdance.com>
Mon, 30 Mar 2015 01:53:05 +0000 (21:53 -0400)
This mimics get_param's error handling for the most part.  It is slightly
better in some regards as get_param can produce some really weird results for
duplicate *0* parts.  It departs from get_param slightly in that if we have a
mix of non-extended and extended pieces for the same parameter name, the new
parser assumes they were all supposed to be extended and concatenates all the
values, whereas get_param always picks the non-extended parameter value.  All
of this error recovery is pretty much arbitrary decisions...

Lib/email/_header_value_parser.py
Lib/test/test_email/test__header_value_parser.py
Misc/NEWS

index 1806cac9053a1573a59b9be513583fd3ba16e551..a9bdf4458b62a5df64a81c71752bdf26bca74973 100644 (file)
@@ -71,6 +71,7 @@ import re
 import urllib   # For urllib.parse.unquote
 from string import hexdigits
 from collections import OrderedDict
+from operator import itemgetter
 from email import _encoded_words as _ew
 from email import errors
 from email import utils
@@ -1098,15 +1099,34 @@ class MimeParameters(TokenList):
                 params[name] = []
             params[name].append((token.section_number, token))
         for name, parts in params.items():
-            parts = sorted(parts)
-            # XXX: there might be more recovery we could do here if, for
-            # example, this is really a case of a duplicate attribute name.
+            parts = sorted(parts, key=itemgetter(0))
+            first_param = parts[0][1]
+            charset = first_param.charset
+            # Our arbitrary error recovery is to ignore duplicate parameters,
+            # to use appearance order if there are duplicate rfc 2231 parts,
+            # and to ignore gaps.  This mimics the error recovery of get_param.
+            if not first_param.extended and len(parts) > 1:
+                if parts[1][0] == 0:
+                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
+                        'duplicate parameter name; duplicate(s) ignored'))
+                    parts = parts[:1]
+                # Else assume the *0* was missing...note that this is different
+                # from get_param, but we registered a defect for this earlier.
             value_parts = []
-            charset = parts[0][1].charset
-            for i, (section_number, param) in enumerate(parts):
+            i = 0
+            for section_number, param in parts:
                 if section_number != i:
-                    param.defects.append(errors.InvalidHeaderDefect(
-                        "inconsistent multipart parameter numbering"))
+                    # We could get fancier here and look for a complete
+                    # duplicate extended parameter and ignore the second one
+                    # seen.  But we're not doing that.  The old code didn't.
+                    if not param.extended:
+                        param.defects.append(errors.InvalidHeaderDefect(
+                            'duplicate parameter name; duplicate ignored'))
+                        continue
+                    else:
+                        param.defects.append(errors.InvalidHeaderDefect(
+                            "inconsistent RFC2231 parameter numbering"))
+                i += 1
                 value = param.param_value
                 if param.extended:
                     try:
index 5404d1913f88fbf8c37f8c01a5e1fbe00847e105..d028f7440beac8147e76d7e7e4a56ab1840336a4 100644 (file)
@@ -2456,6 +2456,115 @@ class TestParser(TestParserMixin, TestEmailBase):
             ";foo", ";foo", ";foo", [errors.InvalidHeaderDefect]*3
         )
 
+
+@parameterize
+class Test_parse_mime_parameters(TestParserMixin, TestEmailBase):
+
+    def mime_parameters_as_value(self,
+                                 value,
+                                 tl_str,
+                                 tl_value,
+                                 params,
+                                 defects):
+        mime_parameters = self._test_parse_x(parser.parse_mime_parameters,
+            value, tl_str, tl_value, defects)
+        self.assertEqual(mime_parameters.token_type, 'mime-parameters')
+        self.assertEqual(list(mime_parameters.params), params)
+
+
+    mime_parameters_params = {
+
+        'simple': (
+            'filename="abc.py"',
+            ' filename="abc.py"',
+            'filename=abc.py',
+            [('filename', 'abc.py')],
+            []),
+
+        'multiple_keys': (
+            'filename="abc.py"; xyz=abc',
+            ' filename="abc.py"; xyz="abc"',
+            'filename=abc.py; xyz=abc',
+            [('filename', 'abc.py'), ('xyz', 'abc')],
+            []),
+
+        'split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            [('filename', '201.tif')],
+            []),
+
+        # Note that it is undefined what we should do for error recovery when
+        # there are duplicate parameter names or duplicate parts in a split
+        # part.  We choose to ignore all duplicate parameters after the first
+        # and to take duplicate or missing rfc 2231 parts in appearance order.
+        # This is backward compatible with get_param's behavior, but the
+        # decisions are arbitrary.
+
+        'duplicate_key': (
+            'filename=abc.gif; filename=def.tiff',
+            ' filename="abc.gif"',
+            "filename=abc.gif; filename=def.tiff",
+            [('filename', 'abc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_key_with_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename=abc.gif",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename=abc.gif",
+            [('filename', '201.tif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_key_with_split_value_other_order': (
+            "filename=abc.gif; "
+                " filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            ' filename="abc.gif"',
+            "filename=abc.gif;"
+                " filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            [('filename', 'abc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_in_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename*1*=abc.gif",
+            ' filename="201.tifabc.gif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename*1*=abc.gif",
+            [('filename', '201.tifabc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'missing_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;",
+            [('filename', '201.tif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_and_missing_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;"
+                " filename*3*=abc.gif",
+            ' filename="201.tifabc.gif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;"
+                " filename*3*=abc.gif",
+            [('filename', '201.tifabc.gif')],
+            [errors.InvalidHeaderDefect]*2),
+
+        # Here we depart from get_param and assume the *0* was missing.
+        'duplicate_with_broken_split_value': (
+            "filename=abc.gif; "
+                " filename*2*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66",
+            ' filename="abc.gif201.tif"',
+            "filename=abc.gif;"
+                " filename*2*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66",
+            [('filename', 'abc.gif201.tif')],
+            # Defects are apparent missing *0*, and two 'out of sequence'.
+            [errors.InvalidHeaderDefect]*3),
+
+    }
+
 @parameterize
 class Test_parse_mime_version(TestParserMixin, TestEmailBase):
 
index b80cbe25f94d8e57f2997a37cef9ee72330ee67b..ebac8d52f65380e62accbace5bbe47fd777096fe 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -21,6 +21,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #23745: The new email header parser now handles duplicate MIME
+  parameter names without error, similar to how get_param behaves.
+
 - Issue #23792: Ignore KeyboardInterrupt when the pydoc pager is active.
   This mimics the behavior of the standard unix pagers, and prevents
   pipepager from shutting down while the pager itself is still running.