]> granicus.if.org Git - python/commitdiff
[3.8] bpo-21315: Fix parsing of encoded words with missing leading ws (GH-13425)...
authorAshwin Ramaswami <aramaswamis@gmail.com>
Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)
committerMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Tue, 3 Sep 2019 17:08:39 +0000 (10:08 -0700)
* [bpo-21315](https://bugs.python.org/issue21315): Fix parsing of encoded words with missing leading ws.

Because of missing leading whitespace, encoded word would get parsed as
unstructured token. This patch fixes that by looking for encoded words when
splitting tokens with whitespace.

Missing trailing whitespace around encoded word now register a defect
instead.

Original patch suggestion by David R. Murray on [bpo-21315](https://bugs.python.org/issue21315).
(cherry picked from commit 66c4f3f38b867d8329b28c032bb907fd1a2f22d2)

Co-authored-by: Abhilash Raj <maxking@users.noreply.github.com>
(cherry picked from commit dc20fc4311dece19488299a7cd11317ffbe4d3c3)

Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
https://bugs.python.org/issue21315

Lib/email/_header_value_parser.py
Lib/test/test_email/test__header_value_parser.py
Lib/test/test_email/test_headerregistry.py
Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst [new file with mode: 0644]

index ea7083fc88b8c08d66a3d7e2b91d1d00d2c4ee47..b5003943ab0d97dee00f4162ad9a66cb2174baa6 100644 (file)
@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
 
+# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded word
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
 #
 # TokenList and its subclasses
 #
@@ -1054,6 +1066,10 @@ def get_encoded_word(value):
         _validate_xtext(vtext)
         ew.append(vtext)
         text = ''.join(remainder)
+    # Encoded words should be followed by a WS
+    if value and value[0] not in WSP:
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
     return ew, value
 
 def get_unstructured(value):
@@ -1106,6 +1122,11 @@ def get_unstructured(value):
                 unstructured.append(token)
                 continue
         tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is a rfc2047 encoded word
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
         vtext = ValueTerminal(tok, 'vtext')
         _validate_xtext(vtext)
         unstructured.append(vtext)
index 198cf17bbf3ed82be352ed2d4efe0ed47a979d86..bad4333dbc43517578f2e57d4e86afd7618744cc 100644 (file)
@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase):
                          '=?us-ascii?q?first?==?utf-8?q?second?=',
                          'first',
                          'first',
-                         [],
+                         [errors.InvalidHeaderDefect],
                          '=?utf-8?q?second?=')
 
     def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase):
             '=?utf-8?q?foo?==?utf-8?q?bar?=',
             'foobar',
             'foobar',
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_leading_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            'nowhitespace=?utf-8?q?somevalue?=',
+            'nowhitespacesomevalue',
+            'nowhitespacesomevalue',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_trailing_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            '=?utf-8?q?somevalue?=nowhitespace',
+            'somevaluenowhitespace',
+            'somevaluenowhitespace',
             [errors.InvalidHeaderDefect],
             '')
 
@@ -550,7 +569,8 @@ class TestParser(TestParserMixin, TestEmailBase):
             '"=?utf-8?Q?not_really_valid?="',
             '"not really valid"',
             'not really valid',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+             errors.InvalidHeaderDefect],
             '')
 
     # get_comment
index a6b48385aeac2acb7006dcfc42eda7c417052e28..8d89c5dd58322e2af899056c96f929bbbaf2f481 100644 (file)
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
 
         'rfc2047_atom_in_quoted_string_is_decoded':
             ('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
             'Éric <foo@example.com>',
             'Éric',
             'foo@example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
new file mode 100644 (file)
index 0000000..dd0dd7f
--- /dev/null
@@ -0,0 +1,4 @@
+Email headers containing RFC2047 encoded words are parsed despite the missing
+whitespace, and a defect registered. Also missing trailing whitespace after
+encoded words is now registered as a defect.
+