granicus.if.org Git - python/commitdiff
#18431: Decode encoded words in atoms in new email parser.
author: R David Murray <rdmurray@bitdance.com>
Fri, 12 Jul 2013 20:00:28 +0000 (16:00 -0400)
committer: R David Murray <rdmurray@bitdance.com>
Fri, 12 Jul 2013 20:00:28 +0000 (16:00 -0400)
There is more to be done here in terms of accepting RFC-invalid
input that some mailers accept, but this covers the RFC-valid
places where encoded words can occur in structured headers.

Lib/email/_header_value_parser.py
Lib/test/test_email/test__header_value_parser.py
Lib/test/test_email/test_headerregistry.py
Misc/NEWS

index a01d845110f8f1b0fb483ec7cbb95011ddf8e533..291437c5867c168cc507d3f59861c4c5c0f02274 100644 (file)
@@ -1627,6 +1627,7 @@ def get_quoted_string(value):
 def get_atom(value):
     """atom = [CFWS] 1*atext [CFWS]
 
+    An atom could be an rfc2047 encoded word.
     """
     atom = Atom()
     if value and value[0] in CFWS_LEADER:
@@ -1635,7 +1636,15 @@ def get_atom(value):
     if value and value[0] in ATOM_ENDS:
         raise errors.HeaderParseError(
             "expected atom but found '{}'".format(value))
-    token, value = get_atext(value)
+    if value.startswith('=?'):
+        try:
+            token, value = get_encoded_word(value)
+        except errors.HeaderParseError:
+            # XXX: need to figure out how to register defects when
+            # appropriate here.
+            token, value = get_atext(value)
+    else:
+        token, value = get_atext(value)
     atom.append(token)
     if value and value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
@@ -1664,12 +1673,22 @@ def get_dot_atom_text(value):
 def get_dot_atom(value):
     """ dot-atom = [CFWS] dot-atom-text [CFWS]
 
+    Any place we can have a dot atom, we could instead have an rfc2047 encoded
+    word.
     """
     dot_atom = DotAtom()
     if value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
         dot_atom.append(token)
-    token, value = get_dot_atom_text(value)
+    if value.startswith('=?'):
+        try:
+            token, value = get_encoded_word(value)
+        except errors.HeaderParseError:
+            # XXX: need to figure out how to register defects when
+            # appropriate here.
+            token, value = get_dot_atom_text(value)
+    else:
+        token, value = get_dot_atom_text(value)
     dot_atom.append(token)
     if value and value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
index 8917447217c53f84218e7fdc4aea33cec441be6c..646082b4a40a24cce4d6e99c986b1abdd4663814 100644 (file)
@@ -808,9 +808,13 @@ class TestParser(TestParserMixin, TestEmailBase):
         self.assertEqual(atom[2].comments, ['bar'])
 
     def test_get_atom_atom_ends_at_noncfws(self):
-        atom = self._test_get_x(parser.get_atom,
+        self._test_get_x(parser.get_atom,
             'bob  fred', 'bob  ', 'bob ', [], 'fred')
 
+    def test_get_atom_rfc2047_atom(self):
+        self._test_get_x(parser.get_atom,
+            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
+
     # get_dot_atom_text
 
     def test_get_dot_atom_text(self):
@@ -885,6 +889,10 @@ class TestParser(TestParserMixin, TestEmailBase):
         with self.assertRaises(errors.HeaderParseError):
             parser.get_dot_atom(' (foo) bar.bang. foo')
 
+    def test_get_dot_atom_rfc2047_atom(self):
+        self._test_get_x(parser.get_dot_atom,
+            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
+
     # get_word (if this were black box we'd repeat all the qs/atom tests)
 
     def test_get_word_atom_yields_atom(self):
@@ -2156,6 +2164,22 @@ class TestParser(TestParserMixin, TestEmailBase):
         self.assertEqual(address[0].token_type,
                          'mailbox')
 
+    def test_get_address_rfc2047_display_name(self):
+        address = self._test_get_x(parser.get_address,
+            '=?utf-8?q?=C3=89ric?= <foo@example.com>',
+            'Éric <foo@example.com>',
+            'Éric <foo@example.com>',
+            [],
+            '')
+        self.assertEqual(address.token_type, 'address')
+        self.assertEqual(len(address.mailboxes), 1)
+        self.assertEqual(address.mailboxes,
+                         address.all_mailboxes)
+        self.assertEqual(address.mailboxes[0].display_name,
+                         'Éric')
+        self.assertEqual(address[0].token_type,
+                         'mailbox')
+
     def test_get_address_empty_group(self):
         address = self._test_get_x(parser.get_address,
             'Monty Python:;',
index 80f1c0238e483d2bd8e86865b9eb55505520287f..f754a324316ba4d583a5a1510ca30116b9e65d39 100644 (file)
@@ -158,6 +158,10 @@ class TestUnstructuredHeader(TestHeaderBase):
             '=?utf-8?q?=C3=89ric?=',
             'Éric'),
 
+        'rfc2047_quopri_with_regular_text': (
+            'The =?utf-8?q?=C3=89ric=2C?= Himself',
+            'The Éric, Himself'),
+
     }
 
 
@@ -1119,6 +1123,26 @@ class TestAddressHeader(TestHeaderBase):
              'example.com',
              None),
 
+        'rfc2047_atom_is_decoded':
+            ('=?utf-8?q?=C3=89ric?= <foo@example.com>',
+            [],
+            'Éric <foo@example.com>',
+            'Éric',
+            'foo@example.com',
+            'foo',
+            'example.com',
+            None),
+
+        'rfc2047_atom_in_phrase_is_decoded':
+            ('The =?utf-8?q?=C3=89ric=2C?= Himself <foo@example.com>',
+            [],
+            '"The Éric, Himself" <foo@example.com>',
+            'The Éric, Himself',
+            'foo@example.com',
+            'foo',
+            'example.com',
+            None),
+
         }
 
         # XXX: Need many more examples, and in particular some with names in
index c068ed81ee9e83e9652db7dd25f7c8d631ee0da9..65e60f4b781a04c081c7ead32a138f0cbf3288c5 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #18431: The new email header parser now decodes RFC2047 encoded words
+  in structured headers.
+
 - Issue #18044: The new email header parser was mis-parsing encoded words where
   an encoded character immediately followed the '?' that follows the CTE
   character, resulting in a decoding failure.  They are now decoded correctly.