#18431: Decode encoded words in atoms in new email parser.

author R David Murray <rdmurray@bitdance.com>
Fri, 12 Jul 2013 20:00:28 +0000 (16:00 -0400)
committer R David Murray <rdmurray@bitdance.com>
Fri, 12 Jul 2013 20:00:28 +0000 (16:00 -0400)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index a01d845110f8f1b0fb483ec7cbb95011ddf8e533..291437c5867c168cc507d3f59861c4c5c0f02274 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -1627,6 +1627,7 @@ def get_quoted_string(value):
  def get_atom(value):
      """atom = [CFWS] 1*atext [CFWS]
  
+    An atom could be an rfc2047 encoded word.
      """
      atom = Atom()
      if value and value[0] in CFWS_LEADER:
@@ -1635,7 +1636,15 @@ def get_atom(value):
      if value and value[0] in ATOM_ENDS:
          raise errors.HeaderParseError(
              "expected atom but found '{}'".format(value))
-    token, value = get_atext(value)
+    if value.startswith('=?'):
+        try:
+            token, value = get_encoded_word(value)
+        except errors.HeaderParseError:
+            # XXX: need to figure out how to register defects when
+            # appropriate here.
+            token, value = get_atext(value)
+    else:
+        token, value = get_atext(value)
      atom.append(token)
      if value and value[0] in CFWS_LEADER:
          token, value = get_cfws(value)
@@ -1664,12 +1673,22 @@ def get_dot_atom_text(value):
  def get_dot_atom(value):
      """ dot-atom = [CFWS] dot-atom-text [CFWS]
  
+    Any place we can have a dot atom, we could instead have an rfc2047 encoded
+    word.
      """
      dot_atom = DotAtom()
      if value[0] in CFWS_LEADER:
          token, value = get_cfws(value)
          dot_atom.append(token)
-    token, value = get_dot_atom_text(value)
+    if value.startswith('=?'):
+        try:
+            token, value = get_encoded_word(value)
+        except errors.HeaderParseError:
+            # XXX: need to figure out how to register defects when
+            # appropriate here.
+            token, value = get_dot_atom_text(value)
+    else:
+        token, value = get_dot_atom_text(value)
      dot_atom.append(token)
      if value and value[0] in CFWS_LEADER:
          token, value = get_cfws(value)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index 8917447217c53f84218e7fdc4aea33cec441be6c..646082b4a40a24cce4d6e99c986b1abdd4663814 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -808,9 +808,13 @@ class TestParser(TestParserMixin, TestEmailBase):
          self.assertEqual(atom[2].comments, ['bar'])
  
      def test_get_atom_atom_ends_at_noncfws(self):
-        atom = self._test_get_x(parser.get_atom,
+        self._test_get_x(parser.get_atom,
              'bob  fred', 'bob  ', 'bob ', [], 'fred')
  
+    def test_get_atom_rfc2047_atom(self):
+        self._test_get_x(parser.get_atom,
+            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
+
      # get_dot_atom_text
  
      def test_get_dot_atom_text(self):
@@ -885,6 +889,10 @@ class TestParser(TestParserMixin, TestEmailBase):
          with self.assertRaises(errors.HeaderParseError):
              parser.get_dot_atom(' (foo) bar.bang. foo')
  
+    def test_get_dot_atom_rfc2047_atom(self):
+        self._test_get_x(parser.get_dot_atom,
+            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
+
      # get_word (if this were black box we'd repeat all the qs/atom tests)
  
      def test_get_word_atom_yields_atom(self):
@@ -2156,6 +2164,22 @@ class TestParser(TestParserMixin, TestEmailBase):
          self.assertEqual(address[0].token_type,
                           'mailbox')
  
+    def test_get_address_rfc2047_display_name(self):
+        address = self._test_get_x(parser.get_address,
+            '=?utf-8?q?=C3=89ric?= <foo@example.com>',
+            'Éric <foo@example.com>',
+            'Éric <foo@example.com>',
+            [],
+            '')
+        self.assertEqual(address.token_type, 'address')
+        self.assertEqual(len(address.mailboxes), 1)
+        self.assertEqual(address.mailboxes,
+                         address.all_mailboxes)
+        self.assertEqual(address.mailboxes[0].display_name,
+                         'Éric')
+        self.assertEqual(address[0].token_type,
+                         'mailbox')
+
      def test_get_address_empty_group(self):
          address = self._test_get_x(parser.get_address,
              'Monty Python:;',
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py

index 80f1c0238e483d2bd8e86865b9eb55505520287f..f754a324316ba4d583a5a1510ca30116b9e65d39 100644 (file)
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -158,6 +158,10 @@ class TestUnstructuredHeader(TestHeaderBase):
              '=?utf-8?q?=C3=89ric?=',
              'Éric'),
  
+        'rfc2047_quopri_with_regular_text': (
+            'The =?utf-8?q?=C3=89ric=2C?= Himself',
+            'The Éric, Himself'),
+
      }
  
  
@@ -1119,6 +1123,26 @@ class TestAddressHeader(TestHeaderBase):
               'example.com',
               None),
  
+        'rfc2047_atom_is_decoded':
+            ('=?utf-8?q?=C3=89ric?= <foo@example.com>',
+            [],
+            'Éric <foo@example.com>',
+            'Éric',
+            'foo@example.com',
+            'foo',
+            'example.com',
+            None),
+
+        'rfc2047_atom_in_phrase_is_decoded':
+            ('The =?utf-8?q?=C3=89ric=2C?= Himself <foo@example.com>',
+            [],
+            '"The Éric, Himself" <foo@example.com>',
+            'The Éric, Himself',
+            'foo@example.com',
+            'foo',
+            'example.com',
+            None),
+
          }
  
          # XXX: Need many more examples, and in particular some with names in
diff --git a/Misc/NEWS b/Misc/NEWS

index c068ed81ee9e83e9652db7dd25f7c8d631ee0da9..65e60f4b781a04c081c7ead32a138f0cbf3288c5 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #18431: The new email header parser now decodes RFC2047 encoded words
+  in structured headers.
+
  - Issue #18044: The new email header parser was mis-parsing encoded words where
    an encoded character immediately followed the '?' that follows the CTE
    character, resulting in a decoding failure.  They are now decoded correctly.
author	R David Murray <rdmurray@bitdance.com>
	Fri, 12 Jul 2013 20:00:28 +0000 (16:00 -0400)
committer	R David Murray <rdmurray@bitdance.com>
	Fri, 12 Jul 2013 20:00:28 +0000 (16:00 -0400)
Lib/email/_header_value_parser.py		patch | blob | history
Lib/test/test_email/test__header_value_parser.py		patch | blob | history
Lib/test/test_email/test_headerregistry.py		patch | blob | history
Misc/NEWS		patch | blob | history