#24211: Add RFC6532 support to the email library.

author R David Murray <rdmurray@bitdance.com>

Sun, 17 May 2015 15:29:21 +0000 (11:29 -0400)

committer R David Murray <rdmurray@bitdance.com>

Sun, 17 May 2015 15:29:21 +0000 (11:29 -0400)
author R David Murray <rdmurray@bitdance.com>
Sun, 17 May 2015 15:29:21 +0000 (11:29 -0400)
committer R David Murray <rdmurray@bitdance.com>
Sun, 17 May 2015 15:29:21 +0000 (11:29 -0400)
diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst

index d4e3fc186a50dae6db4cfe301387837a25770ef8..9fadfb3493f9d19250d44d5bd7c57bc7dcefc595 100644 (file)
--- a/Doc/library/email.policy.rst
+++ b/Doc/library/email.policy.rst
@@ -378,6 +378,14 @@ added matters.  To illustrate::
     In addition to the settable attributes listed above that apply to all
     policies, this policy adds the following additional attributes:
  
+   .. attribute:: utf8
+
+      If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
+      headers by encoding them as "encoded words".  If ``True``, follow
+      :rfc:`6532` and use ``utf-8`` encoding for headers.  Messages
+      formatted in this way may be passed to SMTP servers that support
+      the ``SMTPUTF8`` extension (:rfc:`6531`).
+
     .. attribute:: refold_source
  
        If the value for a header in the ``Message`` object originated from a
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst

index 0360de494ee1c732a94ea98faf9e3c6e9dc0c657..51a3aa3933c6092667119a752cce373a291dfc92 100644 (file)
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -356,6 +356,12 @@ email
    header (``None`` if there is no such header).  (Contributed by Abhilash Raj
    in :issue:`21083`.)
  
+* A new policy option :attr:`~email.policy.EmailPolicy.utf8` can be set to
+  ``True`` to encode email headers using the UTF-8 charset instead of using
+  encoded words.  This allows ``Messages`` to be formatted according to
+  :rfc:`6532` and used with an SMTP server that supports the :rfc:`6531`
+  ``SMTPUTF8`` extension.  (Contributed by R. David Murray in :issue:`24211`.)
+
  glob
  ----
  
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index a9bdf4458b62a5df64a81c71752bdf26bca74973..f264191dc4a4452404e2d38bfa11bfdfd6b9c5c3 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -320,17 +320,18 @@ class TokenList(list):
          return ''.join(res)
  
      def _fold(self, folded):
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
          for part in self.parts:
              tstr = str(part)
              tlen = len(tstr)
              try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
              except UnicodeEncodeError:
                  if any(isinstance(x, errors.UndecodableBytesDefect)
                          for x in part.all_defects):
                      charset = 'unknown-8bit'
                  else:
-                    # XXX: this should be a policy setting
+                    # XXX: this should be a policy setting when utf8 is False.
                      charset = 'utf-8'
                  tstr = part.cte_encode(charset, folded.policy)
                  tlen = len(tstr)
@@ -394,11 +395,12 @@ class UnstructuredTokenList(TokenList):
  
      def _fold(self, folded):
          last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
          for part in self.parts:
              tstr = str(part)
              is_ew = False
              try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
              except UnicodeEncodeError:
                  if any(isinstance(x, errors.UndecodableBytesDefect)
                         for x in part.all_defects):
@@ -475,12 +477,13 @@ class Phrase(TokenList):
          # comment that becomes a barrier across which we can't compose encoded
          # words.
          last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
          for part in self.parts:
              tstr = str(part)
              tlen = len(tstr)
              has_ew = False
              try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
              except UnicodeEncodeError:
                  if any(isinstance(x, errors.UndecodableBytesDefect)
                          for x in part.all_defects):
diff --git a/Lib/email/policy.py b/Lib/email/policy.py

index f0b20f4b198ac34d58552fb415fef16f9bd8c26d..6ac64a56831d261abe3842eac1376e5b8e85bc58 100644 (file)
--- a/Lib/email/policy.py
+++ b/Lib/email/policy.py
@@ -35,6 +35,13 @@ class EmailPolicy(Policy):
      In addition to the settable attributes listed above that apply to
      all Policies, this policy adds the following additional attributes:
  
+    utf8                -- if False (the default) message headers will be
+                           serialized as ASCII, using encoded words to encode
+                           any non-ASCII characters in the source strings.  If
+                           True, the message headers will be serialized using
+                           utf8 and will not contain encoded words (see RFC
+                           6532 for more on this serialization format).
+
      refold_source       -- if the value for a header in the Message object
                             came from the parsing of some source, this attribute
                             indicates whether or not a generator should refold
@@ -72,6 +79,7 @@ class EmailPolicy(Policy):
  
      """
  
+    utf8 = False
      refold_source = 'long'
      header_factory = HeaderRegistry()
      content_manager = raw_data_manager
@@ -175,9 +183,13 @@ class EmailPolicy(Policy):
          refold_header setting, since there is no way to know whether the binary
          data consists of single byte characters or multibyte characters.
  
+        If utf8 is true, headers are encoded to utf8, otherwise to ascii with
+        non-ASCII unicode rendered as encoded words.
+
          """
          folded = self._fold(name, value, refold_binary=self.cte_type=='7bit')
-        return folded.encode('ascii', 'surrogateescape')
+        charset = 'utf8' if self.utf8 else 'ascii'
+        return folded.encode(charset, 'surrogateescape')
  
      def _fold(self, name, value, refold_binary=False):
          if hasattr(value, 'name'):
@@ -199,3 +211,4 @@ del default.header_factory
  strict = default.clone(raise_on_defect=True)
  SMTP = default.clone(linesep='\r\n')
  HTTP = default.clone(linesep='\r\n', max_line_length=None)
+SMTPUTF8 = SMTP.clone(utf8=True)
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py

index 89174081719238e491cca8e015c0059cf455ba93..920f870c232e1dd53f83e99b62d2412bf47c8c5d 100644 (file)
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -2,6 +2,7 @@ import io
  import textwrap
  import unittest
  from email import message_from_string, message_from_bytes
+from email.message import EmailMessage
  from email.generator import Generator, BytesGenerator
  from email import policy
  from test.test_email import TestEmailBase, parameterize
@@ -194,6 +195,27 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
          g.flatten(msg)
          self.assertEqual(s.getvalue(), expected)
  
+    def test_smtputf8_policy(self):
+        msg = EmailMessage()
+        msg['From'] = "Páolo <főo@bar.com>"
+        msg['To'] = 'Dinsdale'
+        msg['Subject'] = 'Nudge nudge, wink, wink \u1F609'
+        msg.set_content("oh là là, know what I mean, know what I mean?")
+        expected = textwrap.dedent("""\
+            From: Páolo <főo@bar.com>
+            To: Dinsdale
+            Subject: Nudge nudge, wink, wink \u1F609
+            Content-Type: text/plain; charset="utf-8"
+            Content-Transfer-Encoding: 8bit
+            MIME-Version: 1.0
+
+            oh là là, know what I mean, know what I mean?
+            """).encode('utf-8').replace(b'\n', b'\r\n')
+        s = io.BytesIO()
+        g = BytesGenerator(s, policy=policy.SMTPUTF8)
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py

index e797f36b72ed3aa6a852d9a6439d41000ea44162..4b0a04e59421936ee43813ab8ef30dca591261ae 100644 (file)
--- a/Lib/test/test_email/test_policy.py
+++ b/Lib/test/test_email/test_policy.py
@@ -27,6 +27,7 @@ class PolicyAPITests(unittest.TestCase):
      # If any of these defaults change, the docs must be updated.
      policy_defaults = compat32_defaults.copy()
      policy_defaults.update({
+        'utf8':                     False,
          'raise_on_defect':          False,
          'header_factory':           email.policy.EmailPolicy.header_factory,
          'refold_source':            'long',
@@ -42,6 +43,9 @@ class PolicyAPITests(unittest.TestCase):
          email.policy.default: make_defaults(policy_defaults, {}),
          email.policy.SMTP: make_defaults(policy_defaults,
                                           {'linesep': '\r\n'}),
+        email.policy.SMTPUTF8: make_defaults(policy_defaults,
+                                             {'linesep': '\r\n',
+                                              'utf8': True}),
          email.policy.HTTP: make_defaults(policy_defaults,
                                           {'linesep': '\r\n',
                                            'max_line_length': None}),
diff --git a/Misc/NEWS b/Misc/NEWS

index c905bccd6e618e2474cfdc1310c828081f7dac3d..5ae6031fa45c0d9bff033937fd514b7844c2f670 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #24211: The email library now supports RFC 6532: it can generate
+  headers using utf-8 instead of encoded words.
+
  - Issue #16314: Added support for the LZMA compression in distutils.
  
  - Issue #21804: poplib now supports RFC 6856 (UTF8).
author	R David Murray <rdmurray@bitdance.com>
	Sun, 17 May 2015 15:29:21 +0000 (11:29 -0400)
committer	R David Murray <rdmurray@bitdance.com>
	Sun, 17 May 2015 15:29:21 +0000 (11:29 -0400)
Doc/library/email.policy.rst		patch \| blob \| history
Doc/whatsnew/3.5.rst		patch \| blob \| history
Lib/email/_header_value_parser.py		patch \| blob \| history
Lib/email/policy.py		patch \| blob \| history
Lib/test/test_email/test_generator.py		patch \| blob \| history
Lib/test/test_email/test_policy.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history