[3.8] bpo-37764: Fix infinite loop when parsing unstructured email headers. (GH-15239...

author Abhilash Raj <maxking@users.noreply.github.com>

Thu, 5 Sep 2019 01:20:40 +0000 (18:20 -0700)

committer GitHub <noreply@github.com>

Thu, 5 Sep 2019 01:20:40 +0000 (18:20 -0700)
author Abhilash Raj <maxking@users.noreply.github.com>
Thu, 5 Sep 2019 01:20:40 +0000 (18:20 -0700)
committer GitHub <noreply@github.com>
Thu, 5 Sep 2019 01:20:40 +0000 (18:20 -0700)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index b5003943ab0d97dee00f4162ad9a66cb2174baa6..16c19907d68d5915c7fbc3fb7655f16f5b7d5bb0 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -935,6 +935,10 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
          return ''
  
  
+class _InvalidEwError(errors.HeaderParseError):
+    """Invalid encoded word found while parsing headers."""
+
+
  # XXX these need to become classes and used as instances so
  # that a program can't change them in a parse tree and screw
  # up other parse trees.  Maybe should have  tests for that, too.
@@ -1039,7 +1043,10 @@ def get_encoded_word(value):
          raise errors.HeaderParseError(
              "expected encoded word but found {}".format(value))
      remstr = ''.join(remainder)
-    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
+    if (len(remstr) > 1 and
+        remstr[0] in hexdigits and
+        remstr[1] in hexdigits and
+        tok.count('?') < 2):
          # The ? after the CTE was followed by an encoded word escape (=XX).
          rest, *remainder = remstr.split('?=', 1)
          tok = tok + '?=' + rest
@@ -1051,7 +1058,7 @@ def get_encoded_word(value):
      try:
          text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
      except ValueError:
-        raise errors.HeaderParseError(
+        raise _InvalidEwError(
              "encoded word format invalid: '{}'".format(ew.cte))
      ew.charset = charset
      ew.lang = lang
@@ -1101,9 +1108,12 @@ def get_unstructured(value):
              token, value = get_fws(value)
              unstructured.append(token)
              continue
+        valid_ew = True
          if value.startswith('=?'):
              try:
                  token, value = get_encoded_word(value)
+            except _InvalidEwError:
+                valid_ew = False
              except errors.HeaderParseError:
                  # XXX: Need to figure out how to register defects when
                  # appropriate here.
@@ -1125,7 +1135,10 @@ def get_unstructured(value):
          # Split in the middle of an atom if there is a rfc2047 encoded word
          # which does not have WSP on both sides. The defect will be registered
          # the next time through the loop.
-        if rfc2047_matcher.search(tok):
+        # This needs to only be performed when the encoded word is valid;
+        # otherwise, performing it on an invalid encoded word can cause
+        # the parser to go into an infinite loop.
+        if valid_ew and rfc2047_matcher.search(tok):
              tok, *remainder = value.partition('=?')
          vtext = ValueTerminal(tok, 'vtext')
          _validate_xtext(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index bad4333dbc43517578f2e57d4e86afd7618744cc..dd33b065c804bc75e81972d809a48cf02aaaefc2 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -383,6 +383,22 @@ class TestParser(TestParserMixin, TestEmailBase):
              [errors.InvalidHeaderDefect],
              '')
  
+    def test_get_unstructured_without_trailing_whitespace_hang_case(self):
+        self._test_get_x(self._get_unst,
+            '=?utf-8?q?somevalue?=aa',
+            'somevalueaa',
+            'somevalueaa',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_invalid_ew(self):
+        self._test_get_x(self._get_unst,
+            '=?utf-8?q?=somevalue?=',
+            '=?utf-8?q?=somevalue?=',
+            '=?utf-8?q?=somevalue?=',
+            [],
+            '')
+
      # get_qp_ctext
  
      def test_get_qp_ctext_only(self):
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py

index aa775881c5521aee488e8c8fa14c46b44d6ee80c..5414cf070cc12fc9cf736dc4918d0e72c2b00ac2 100644 (file)
--- a/Lib/test/test_email/test_email.py
+++ b/Lib/test/test_email/test_email.py
@@ -5381,6 +5381,27 @@ Content-Type: application/x-foo;
          eq(language, 'en-us')
          eq(s, 'My Document For You')
  
+    def test_should_not_hang_on_invalid_ew_messages(self):
+        messages = ["""From: user@host.com
+To: user@host.com
+Bad-Header:
+ =?us-ascii?Q?LCSwrV11+IB0rSbSker+M9vWR7wEDSuGqmHD89Gt=ea0nJFSaiz4vX3XMJPT4vrE?=
+ =?us-ascii?Q?xGUZeOnp0o22pLBB7CYLH74Js=wOlK6Tfru2U47qR?=
+ =?us-ascii?Q?72OfyEY2p2=2FrA9xNFyvH+fBTCmazxwzF8nGkK6D?=
+
+Hello!
+""", """From: ï¿½ï¿½ï¿½ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ <xxx@xxx>
+To: "xxx" <xxx@xxx>
+Subject:   ï¿½ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½
+MIME-Version: 1.0
+Content-Type: text/plain; charset="windows-1251";
+Content-Transfer-Encoding: 8bit
+
+ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ ï¿½ ï¿½ï¿½ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ ï¿½ï¿½ï¿½ ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½
+"""]
+        for m in messages:
+            with self.subTest(m=m):
+                msg = email.message_from_string(m)
  
  
  # Tests to ensure that signed parts of an email are completely preserved, as
diff --git a/Misc/ACKS b/Misc/ACKS

index 24e327a5f86e99fe1792b171eea567695464e5fb..def874b0071e6f757d5b28f1f2d09f9074aa4008 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1330,6 +1330,7 @@ Burton Radons
  Abhilash Raj
  Shorya Raj
  Dhushyanth Ramasamy
+Ashwin Ramaswami
  Jeff Ramnani
  Bayard Randel
  Varpu Rantala
diff --git a/Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst b/Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst

new file mode 100644 (file)

index 0000000..27fa8e1
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst
@@ -0,0 +1 @@
+Fixes email._header_value_parser.get_unstructured going into an infinite loop for a specific case in which the email header does not have trailing whitespace, and the case in which it contains an invalid encoded word. Patch by Ashwin Ramaswami.
+\ No newline at end of file
author	Abhilash Raj <maxking@users.noreply.github.com>
	Thu, 5 Sep 2019 01:20:40 +0000 (18:20 -0700)
committer	GitHub <noreply@github.com>
	Thu, 5 Sep 2019 01:20:40 +0000 (18:20 -0700)
Lib/email/_header_value_parser.py		patch | blob | history
Lib/test/test_email/test__header_value_parser.py		patch | blob | history
Lib/test/test_email/test_email.py		patch | blob | history
Misc/ACKS		patch | blob | history
Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst	[new file with mode: 0644]	patch | blob