#10790: make append work when output codec is different from input codec

author R. David Murray <rdmurray@bitdance.com>

Wed, 5 Jan 2011 01:39:32 +0000 (01:39 +0000)

committer R. David Murray <rdmurray@bitdance.com>

Wed, 5 Jan 2011 01:39:32 +0000 (01:39 +0000)
author R. David Murray <rdmurray@bitdance.com>
Wed, 5 Jan 2011 01:39:32 +0000 (01:39 +0000)
committer R. David Murray <rdmurray@bitdance.com>
Wed, 5 Jan 2011 01:39:32 +0000 (01:39 +0000)
diff --git a/Doc/library/email.header.rst b/Doc/library/email.header.rst

index 113a6b872fe4eca003bd85f8f8fc2ce6f65bff21..808f7e5f09c26c7202af47f239ff456e96d94381 100644 (file)
--- a/Doc/library/email.header.rst
+++ b/Doc/library/email.header.rst
@@ -94,14 +94,15 @@ Here is the :class:`Header` class description:
        decoded with that character set.
  
        If *s* is an instance of :class:`str`, then *charset* is a hint specifying
-      the character set of the characters in the string.  In this case, when
-      producing an :rfc:`2822`\ -compliant header using :rfc:`2047` rules, the
-      Unicode string will be encoded using the following charsets in order:
-      ``us-ascii``, the *charset* hint, ``utf-8``.  The first character set to
-      not provoke a :exc:`UnicodeError` is used.
-
-      Optional *errors* is passed through to any :func:`encode` or
-      :func:`ustr.encode` call, and defaults to "strict".
+      the character set of the characters in the string.
+
+      In either case, when producing an :rfc:`2822`\ -compliant header using
+      :rfc:`2047` rules, the string will be encoded using the output codec of
+      the charset.  If the string cannot be encoded using the output codec, a
+      :exc:`UnicodeError` will be raised.
+
+      Optional *errors* is passed as the errors argument to the decode call
+      if *s* is a byte string.
  
  
     .. method:: encode(splitchars=';, \\t', maxlinelen=None, linesep='\\n')
diff --git a/Lib/email/header.py b/Lib/email/header.py

index bb2c5ee76513149e587085b71f5dc0df38586148..94eb1a94aabc58f926209699ccd44c60ae00bda5 100644 (file)
--- a/Lib/email/header.py
+++ b/Lib/email/header.py
@@ -245,32 +245,26 @@ class Header:
          that byte string, and a UnicodeError will be raised if the string
          cannot be decoded with that charset.  If s is a Unicode string, then
          charset is a hint specifying the character set of the characters in
-        the string.  In this case, when producing an RFC 2822 compliant header
-        using RFC 2047 rules, the Unicode string will be encoded using the
-        following charsets in order: us-ascii, the charset hint, utf-8.  The
-        first character set not to provoke a UnicodeError is used.
+        the string.  In either case, when producing an RFC 2822 compliant
+        header using RFC 2047 rules, the string will be encoded using the
+        output codec of the charset.  If the string cannot be encoded to the
+        output codec, a UnicodeError will be raised.
  
-        Optional `errors' is passed as the third argument to any unicode() or
-        ustr.encode() call.
+        Optional `errors' is passed as the errors argument to the decode
+        call if s is a byte string.
          """
          if charset is None:
              charset = self._charset
          elif not isinstance(charset, Charset):
              charset = Charset(charset)
-        if isinstance(s, str):
-            # Convert the string from the input character set to the output
-            # character set and store the resulting bytes and the charset for
-            # composition later.
+        if not isinstance(s, str):
              input_charset = charset.input_codec or 'us-ascii'
-            input_bytes = s.encode(input_charset, errors)
-        else:
-            # We already have the bytes we will store internally.
-            input_bytes = s
+            s = s.decode(input_charset, errors)
          # Ensure that the bytes we're storing can be decoded to the output
          # character set, otherwise an early error is thrown.
          output_charset = charset.output_codec or 'us-ascii'
-        output_string = input_bytes.decode(output_charset, errors)
-        self._chunks.append((output_string, charset))
+        s.encode(output_charset, errors)
+        self._chunks.append((s, charset))
  
      def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
          """Encode a message header into an RFC-compliant format.
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py

index 73ac3470e70300a0360b0760ca1712db397f64c0..e7fcee3b6d5f09e064f8480c78bd34317499ddbd 100644 (file)
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -3620,6 +3620,10 @@ A very long line that must get split to something other than at the
          s = 'Subject: =?EUC-KR?B?CSixpLDtKSC/7Liuvsax4iC6uLmwMcijIKHaILzSwd/H0SC8+LCjwLsgv7W/+Mj3I ?='
          raises(errors.HeaderParseError, decode_header, s)
  
+    def test_shift_jis_charset(self):
+        h = Header('文', charset='shift_jis')
+        self.assertEqual(h.encode(), '=?iso-2022-jp?b?GyRCSjgbKEI=?=')
+
  
  
  # Test RFC 2231 header parameters (en/de)coding
diff --git a/Misc/NEWS b/Misc/NEWS

index 5dd389ce855380540c5df7cd36ebf6bb0a15ce45..7ce7445716a6c3cc95a82a2aa909d9bce6dd65b1 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -30,6 +30,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #10790: email.header.Header.append's charset logic now works correctly
+  for charsets whose output codec is different from their input codec.
+
  - Issue #10819: SocketIO.name property returns -1 when its closed, instead of
    raising a ValueError, to fix repr().
author	R. David Murray <rdmurray@bitdance.com>
	Wed, 5 Jan 2011 01:39:32 +0000 (01:39 +0000)
committer	R. David Murray <rdmurray@bitdance.com>
	Wed, 5 Jan 2011 01:39:32 +0000 (01:39 +0000)
Doc/library/email.header.rst		patch \| blob \| history
Lib/email/header.py		patch \| blob \| history
Lib/email/test/test_email.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history