bpo-21315: Fix parsing of encoded words with missing leading ws. (#13425)

author Abhilash Raj <maxking@users.noreply.github.com>

Wed, 5 Jun 2019 16:56:33 +0000 (12:56 -0400)

committer Barry Warsaw <barry@python.org>

Wed, 5 Jun 2019 16:56:33 +0000 (09:56 -0700)
author Abhilash Raj <maxking@users.noreply.github.com>
Wed, 5 Jun 2019 16:56:33 +0000 (12:56 -0400)
committer Barry Warsaw <barry@python.org>
Wed, 5 Jun 2019 16:56:33 +0000 (09:56 -0700)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 34969ab59151193432a18bf90274517791b0f410..35d746aa50825a3d19bb6424343d25a4fa7b7922 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
  def quote_string(value):
      return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
  
+# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded text
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
  #
  # TokenList and its subclasses
  #
@@ -1052,6 +1064,10 @@ def get_encoded_word(value):
          _validate_xtext(vtext)
          ew.append(vtext)
          text = ''.join(remainder)
+    # Encoded words should be followed by whitespace (WSP)
+    if value and value[0] not in WSP:
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
      return ew, value
  
  def get_unstructured(value):
@@ -1104,6 +1120,11 @@ def get_unstructured(value):
                  unstructured.append(token)
                  continue
          tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is an RFC 2047 encoded word
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
          vtext = ValueTerminal(tok, 'vtext')
          _validate_xtext(vtext)
          unstructured.append(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index 12da3cffb84c8818084f139f53f4864e51143618..649923fa6c8667e466b28c8ad7cdc295922aea0d 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase):
                           '=?us-ascii?q?first?==?utf-8?q?second?=',
                           'first',
                           'first',
-                         [],
+                         [errors.InvalidHeaderDefect],
                           '=?utf-8?q?second?=')
  
      def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase):
              '=?utf-8?q?foo?==?utf-8?q?bar?=',
              'foobar',
              'foobar',
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_leading_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            'nowhitespace=?utf-8?q?somevalue?=',
+            'nowhitespacesomevalue',
+            'nowhitespacesomevalue',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_trailing_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            '=?utf-8?q?somevalue?=nowhitespace',
+            'somevaluenowhitespace',
+            'somevaluenowhitespace',
              [errors.InvalidHeaderDefect],
              '')
  
@@ -546,7 +565,8 @@ class TestParser(TestParserMixin, TestEmailBase):
              '"=?utf-8?Q?not_really_valid?="',
              '"not really valid"',
              'not really valid',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+             errors.InvalidHeaderDefect],
              '')
  
      # get_comment
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py

index 75505460aba89eb5c33d09411194dc520fb33647..5d9b3576d306571015966358dafd07121b010a5f 100644 (file)
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
  
          'rfc2047_atom_in_quoted_string_is_decoded':
              ('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
              'Éric <foo@example.com>',
              'Éric',
              'foo@example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst

new file mode 100644 (file)

index 0000000..dd0dd7f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -0,0 +1,4 @@
+Email headers containing RFC 2047 encoded words are now parsed even when the
+surrounding whitespace is missing, and a defect is registered. Missing trailing
+whitespace after an encoded word is now also registered as a defect.
+
author	Abhilash Raj <maxking@users.noreply.github.com>
	Wed, 5 Jun 2019 16:56:33 +0000 (12:56 -0400)
committer	Barry Warsaw <barry@python.org>
	Wed, 5 Jun 2019 16:56:33 +0000 (09:56 -0700)
Lib/email/_header_value_parser.py		patch \| blob \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| history
Lib/test/test_email/test_headerregistry.py		patch \| blob \| history
Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst	[new file with mode: 0644]	patch \| blob