#22233: Only split headers on \r and/or \n, per email RFCs.

author R David Murray <rdmurray@bitdance.com>

Wed, 7 Sep 2016 21:44:34 +0000 (17:44 -0400)

committer R David Murray <rdmurray@bitdance.com>

Wed, 7 Sep 2016 21:44:34 +0000 (17:44 -0400)
author R David Murray <rdmurray@bitdance.com>
Wed, 7 Sep 2016 21:44:34 +0000 (17:44 -0400)
committer R David Murray <rdmurray@bitdance.com>
Wed, 7 Sep 2016 21:44:34 +0000 (17:44 -0400)
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py

index c54201819fa0287373b52b51672cc3f5ce335d43..0b312e567b17be11a645f0851fe88e62820e03ab 100644 (file)
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -27,6 +27,7 @@ from email import errors
  from email import message
  from email._policybase import compat32
  from collections import deque
+from io import StringIO
  
  NLCRE = re.compile('\r\n|\r|\n')
  NLCRE_bol = re.compile('(\r\n|\r|\n)')
@@ -51,8 +52,9 @@ class BufferedSubFile(object):
      simple abstraction -- it parses until EOF closes the current message.
      """
      def __init__(self):
-        # Chunks of the last partial line pushed into this object.
-        self._partial = []
+        # Text stream of the last partial line pushed into this object.
+        # See issue 22233 for why this is a text stream and not a list.
+        self._partial = StringIO(newline='')
          # A deque of full, pushed lines
          self._lines = deque()
          # The stack of false-EOF checking predicates.
@@ -68,8 +70,10 @@ class BufferedSubFile(object):
  
      def close(self):
          # Don't forget any trailing partial line.
-        self.pushlines(''.join(self._partial).splitlines(True))
-        self._partial = []
+        self._partial.seek(0)
+        self.pushlines(self._partial.readlines())
+        self._partial.seek(0)
+        self._partial.truncate()
          self._closed = True
  
      def readline(self):
@@ -97,26 +101,23 @@ class BufferedSubFile(object):
  
      def push(self, data):
          """Push some new data into this object."""
-        # Crack into lines, but preserve the linesep characters on the end of each
-        parts = data.splitlines(True)
-
-        if not parts or not parts[0].endswith(('\n', '\r')):
-            # No new complete lines, so just accumulate partials
-            self._partial += parts
+        self._partial.write(data)
+        if '\n' not in data and '\r' not in data:
+            # No new complete lines, wait for more.
              return
  
-        if self._partial:
-            # If there are previous leftovers, complete them now
-            self._partial.append(parts[0])
-            parts[0:1] = ''.join(self._partial).splitlines(True)
-            del self._partial[:]
+        # Crack into lines, preserving the linesep characters.
+        self._partial.seek(0)
+        parts = self._partial.readlines()
+        self._partial.seek(0)
+        self._partial.truncate()
  
          # If the last element of the list does not end in a newline, then treat
          # it as a partial line.  We only check for '\n' here because a line
          # ending with '\r' might be a line that was split in the middle of a
          # '\r\n' sequence (see bugs 1555570 and 1721862).
          if not parts[-1].endswith('\n'):
-            self._partial = [parts.pop()]
+            self._partial.write(parts.pop())
          self.pushlines(parts)
  
      def pushlines(self, lines):
diff --git a/Lib/email/policy.py b/Lib/email/policy.py

index 6ac64a56831d261abe3842eac1376e5b8e85bc58..35d0e699c6de212f4ccd41ccc4b24ae65129fe2b 100644 (file)
--- a/Lib/email/policy.py
+++ b/Lib/email/policy.py
@@ -2,6 +2,7 @@
  code that adds all the email6 features.
  """
  
+import re
  from email._policybase import Policy, Compat32, compat32, _extend_docstrings
  from email.utils import _has_surrogates
  from email.headerregistry import HeaderRegistry as HeaderRegistry
@@ -18,6 +19,8 @@ __all__ = [
      'HTTP',
      ]
  
+linesep_splitter = re.compile(r'\n|\r')
+
  @_extend_docstrings
  class EmailPolicy(Policy):
  
@@ -135,6 +138,8 @@ class EmailPolicy(Policy):
          if hasattr(value, 'name') and value.name.lower() == name.lower():
              return (name, value)
          if isinstance(value, str) and len(value.splitlines())>1:
+            # XXX this error message isn't quite right when we use splitlines
+            # (see issue 22233), but I'm not sure what should happen here.
              raise ValueError("Header values may not contain linefeed "
                               "or carriage return characters")
          return (name, self.header_factory(name, value))
@@ -150,7 +155,9 @@ class EmailPolicy(Policy):
          """
          if hasattr(value, 'name'):
              return value
-        return self.header_factory(name, ''.join(value.splitlines()))
+        # We can't use splitlines here because it splits on more than \r and \n.
+        value = ''.join(linesep_splitter.split(value))
+        return self.header_factory(name, value)
  
      def fold(self, name, value):
          """+
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py

index 90fd9e197009b27a5ff37480ed2f36d61f989da9..8e407f70da44753878e817de3c5dbc691759bb9b 100644 (file)
--- a/Lib/test/test_email/test_email.py
+++ b/Lib/test/test_email/test_email.py
@@ -3444,10 +3444,12 @@ class TestFeedParsers(TestEmailBase):
          self.assertEqual(m.keys(), ['a', 'b'])
          m = self.parse(['a:\r', '\nb:\n'])
          self.assertEqual(m.keys(), ['a', 'b'])
+
+        # Only CR and LF should break header fields
          m = self.parse(['a:\x85b:\u2028c:\n'])
-        self.assertEqual(m.items(), [('a', '\x85'), ('b', '\u2028'), ('c', '')])
+        self.assertEqual(m.items(), [('a', '\x85b:\u2028c:')])
          m = self.parse(['a:\r', 'b:\x85', 'c:\n'])
-        self.assertEqual(m.items(), [('a', ''), ('b', '\x85'), ('c', '')])
+        self.assertEqual(m.items(), [('a', ''), ('b', '\x85c:')])
  
      def test_long_lines(self):
          # Expected peak memory use on 32-bit platform: 6*N*M bytes.
diff --git a/Lib/test/test_email/test_parser.py b/Lib/test/test_email/test_parser.py

index b54fdd75897ca0564cf81cefdd27e59bb8d45683..8ddc176389db73d7edc309f26ccd0fd5a42c8a2f 100644 (file)
--- a/Lib/test/test_email/test_parser.py
+++ b/Lib/test/test_email/test_parser.py
@@ -2,6 +2,7 @@ import io
  import email
  import unittest
  from email.message import Message
+from email.policy import default
  from test.test_email import TestEmailBase
  
  
@@ -32,5 +33,45 @@ class TestCustomMessage(TestEmailBase):
      # XXX add tests for other functions that take Message arg.
  
  
+class TestParserBase:
+
+    def test_only_split_on_cr_lf(self):
+        # The unicode line splitter splits on unicode linebreaks, which are
+        # more numerous than allowed by the email RFCs; make sure we are only
+        # splitting on those two.
+        msg = self.parser(
+            "Next-Line: not\x85broken\r\n"
+            "Null: not\x00broken\r\n"
+            "Vertical-Tab: not\vbroken\r\n"
+            "Form-Feed: not\fbroken\r\n"
+            "File-Separator: not\x1Cbroken\r\n"
+            "Group-Separator: not\x1Dbroken\r\n"
+            "Record-Separator: not\x1Ebroken\r\n"
+            "Line-Separator: not\u2028broken\r\n"
+            "Paragraph-Separator: not\u2029broken\r\n"
+            "\r\n",
+            policy=default,
+        )
+        self.assertEqual(msg.items(), [
+            ("Next-Line", "not\x85broken"),
+            ("Null", "not\x00broken"),
+            ("Vertical-Tab", "not\vbroken"),
+            ("Form-Feed", "not\fbroken"),
+            ("File-Separator", "not\x1Cbroken"),
+            ("Group-Separator", "not\x1Dbroken"),
+            ("Record-Separator", "not\x1Ebroken"),
+            ("Line-Separator", "not\u2028broken"),
+            ("Paragraph-Separator", "not\u2029broken"),
+        ])
+        self.assertEqual(msg.get_payload(), "")
+
+class TestParser(TestParserBase, TestEmailBase):
+    parser = staticmethod(email.message_from_string)
+
+class TestBytesParser(TestParserBase, TestEmailBase):
+    def parser(self, s, *args, **kw):
+        return email.message_from_bytes(s.encode(), *args, **kw)
+
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/Lib/test/test_httplib.py b/Lib/test/test_httplib.py

index f45e352d6af098d32537eaf8a072dc5a744cd049..155ab0f19b0fdb79befb6b6ee9a294a8da9b87a7 100644 (file)
--- a/Lib/test/test_httplib.py
+++ b/Lib/test/test_httplib.py
@@ -283,6 +283,36 @@ class HeaderTests(TestCase):
          self.assertEqual(resp.getheader('First'), 'val')
          self.assertEqual(resp.getheader('Second'), 'val')
  
+    def test_parse_all_octets(self):
+        # Ensure no valid header field octet breaks the parser
+        body = (
+            b'HTTP/1.1 200 OK\r\n'
+            b"!#$%&'*+-.^_`|~: value\r\n"  # Special token characters
+            b'VCHAR: ' + bytes(range(0x21, 0x7E + 1)) + b'\r\n'
+            b'obs-text: ' + bytes(range(0x80, 0xFF + 1)) + b'\r\n'
+            b'obs-fold: text\r\n'
+            b' folded with space\r\n'
+            b'\tfolded with tab\r\n'
+            b'Content-Length: 0\r\n'
+            b'\r\n'
+        )
+        sock = FakeSocket(body)
+        resp = client.HTTPResponse(sock)
+        resp.begin()
+        self.assertEqual(resp.getheader('Content-Length'), '0')
+        self.assertEqual(resp.msg['Content-Length'], '0')
+        self.assertEqual(resp.getheader("!#$%&'*+-.^_`|~"), 'value')
+        self.assertEqual(resp.msg["!#$%&'*+-.^_`|~"], 'value')
+        vchar = ''.join(map(chr, range(0x21, 0x7E + 1)))
+        self.assertEqual(resp.getheader('VCHAR'), vchar)
+        self.assertEqual(resp.msg['VCHAR'], vchar)
+        self.assertIsNotNone(resp.getheader('obs-text'))
+        self.assertIn('obs-text', resp.msg)
+        for folded in (resp.getheader('obs-fold'), resp.msg['obs-fold']):
+            self.assertTrue(folded.startswith('text'))
+            self.assertIn(' folded with space', folded)
+            self.assertTrue(folded.endswith('folded with tab'))
+
      def test_invalid_headers(self):
          conn = client.HTTPConnection('example.com')
          conn.sock = FakeSocket('')
diff --git a/Misc/NEWS b/Misc/NEWS

index bfc693aaf05f08803997f4eb8291601d5b550d7a..9437b86f30de8177b2831277586f87d9c3fb9abd 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -62,6 +62,10 @@ Core and Builtins
  Library
  -------
  
+- Issue #22233: Break email header lines *only* on the RFC specified CR and LF
+  characters, not on arbitrary unicode line breaks.  This also fixes a bug in
+  HTTP header parsing.
+
  - Issue 27988: Fix email iter_attachments incorrect mutation of payload list.
  
  - Issue #27691: Fix ssl module's parsing of GEN_RID subject alternative name
author	R David Murray <rdmurray@bitdance.com>
	Wed, 7 Sep 2016 21:44:34 +0000 (17:44 -0400)
committer	R David Murray <rdmurray@bitdance.com>
	Wed, 7 Sep 2016 21:44:34 +0000 (17:44 -0400)
Lib/email/feedparser.py		patch \| blob \| history
Lib/email/policy.py		patch \| blob \| history
Lib/test/test_email/test_email.py		patch \| blob \| history
Lib/test/test_email/test_parser.py		patch \| blob \| history
Lib/test/test_httplib.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history