[3.8] bpo-21315: Fix parsing of encoded words with missing leading ws (GH-13425)...

author Ashwin Ramaswami <aramaswamis@gmail.com>

Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)

committer Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)
author Ashwin Ramaswami <aramaswamis@gmail.com>
Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)
committer Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index ea7083fc88b8c08d66a3d7e2b91d1d00d2c4ee47..b5003943ab0d97dee00f4162ad9a66cb2174baa6 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
  def quote_string(value):
      return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
  
+# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded word
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
  #
  # TokenList and its subclasses
  #
@@ -1054,6 +1066,10 @@ def get_encoded_word(value):
          _validate_xtext(vtext)
          ew.append(vtext)
          text = ''.join(remainder)
+    # Encoded words should be followed by a WS
+    if value and value[0] not in WSP:
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
      return ew, value
  
  def get_unstructured(value):
@@ -1106,6 +1122,11 @@ def get_unstructured(value):
                  unstructured.append(token)
                  continue
          tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is an RFC 2047 encoded word
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
          vtext = ValueTerminal(tok, 'vtext')
          _validate_xtext(vtext)
          unstructured.append(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index 198cf17bbf3ed82be352ed2d4efe0ed47a979d86..bad4333dbc43517578f2e57d4e86afd7618744cc 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase):
                           '=?us-ascii?q?first?==?utf-8?q?second?=',
                           'first',
                           'first',
-                         [],
+                         [errors.InvalidHeaderDefect],
                           '=?utf-8?q?second?=')
  
      def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase):
              '=?utf-8?q?foo?==?utf-8?q?bar?=',
              'foobar',
              'foobar',
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_leading_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            'nowhitespace=?utf-8?q?somevalue?=',
+            'nowhitespacesomevalue',
+            'nowhitespacesomevalue',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_trailing_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            '=?utf-8?q?somevalue?=nowhitespace',
+            'somevaluenowhitespace',
+            'somevaluenowhitespace',
              [errors.InvalidHeaderDefect],
              '')
  
@@ -550,7 +569,8 @@ class TestParser(TestParserMixin, TestEmailBase):
              '"=?utf-8?Q?not_really_valid?="',
              '"not really valid"',
              'not really valid',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+             errors.InvalidHeaderDefect],
              '')
  
      # get_comment
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py

index a6b48385aeac2acb7006dcfc42eda7c417052e28..8d89c5dd58322e2af899056c96f929bbbaf2f481 100644 (file)
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
  
          'rfc2047_atom_in_quoted_string_is_decoded':
              ('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
              'Éric <foo@example.com>',
              'Éric',
              'foo@example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst

new file mode 100644 (file)

index 0000000..dd0dd7f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -0,0 +1,4 @@
+Email headers containing RFC 2047 encoded words are now parsed even when the
+surrounding whitespace is missing, and a defect is registered. Missing trailing
+whitespace after an encoded word is also registered as a defect.
+
author	Ashwin Ramaswami <aramaswamis@gmail.com>
	Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)
committer	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)
Lib/email/_header_value_parser.py		patch \| blob \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| history
Lib/test/test_email/test_headerregistry.py		patch \| blob \| history
Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst	[new file with mode: 0644]	patch \| blob