From 8243ddb6ca5c0f78764a28f044f0c0284774d317 Mon Sep 17 00:00:00 2001
From: Victor Stinner <victor.stinner@haypocalc.com>
Date: Wed, 28 Jul 2010 01:58:41 +0000
Subject: [PATCH] Issue #5006: Better handling of Unicode byte-order marks
 (BOM) in the io library. This means, for example, that opening a UTF-16 text
 file in append mode doesn't add a BOM at the end of the file if the file
 isn't empty.

---
 Lib/io.py           | 20 ++++++++++++++++++++
 Lib/test/test_io.py | 31 +++++++++++++++++++++++++++++++
 Misc/NEWS           |  4 ++++
 3 files changed, 55 insertions(+)

diff --git a/Lib/io.py b/Lib/io.py
index 1458b471fa..9013c58ef1 100644
--- a/Lib/io.py
+++ b/Lib/io.py
@@ -1440,6 +1440,15 @@ class TextIOWrapper(TextIOBase):
         self._snapshot = None  # info for reconstructing decoder state
         self._seekable = self._telling = self.buffer.seekable()
 
+        if self._seekable and self.writable():
+            position = self.buffer.tell()
+            if position != 0:
+                try:
+                    self._get_encoder().setstate(0)
+                except LookupError:
+                    # Sometimes the encoder doesn't exist
+                    pass
+
     # self._snapshot is either None, or a tuple (dec_flags, next_input)
     # where dec_flags is the second (integer) item of the decoder state
     # and next_input is the chunk of input bytes that comes next after the
@@ -1726,6 +1735,17 @@ class TextIOWrapper(TextIOBase):
                 raise IOError("can't restore logical file position")
             self._decoded_chars_used = chars_to_skip
 
+        # Finally, reset the encoder (merely useful for proper BOM handling)
+        try:
+            encoder = self._encoder or self._get_encoder()
+        except LookupError:
+            # Sometimes the encoder doesn't exist
+            pass
+        else:
+            if cookie != 0:
+                encoder.setstate(0)
+            else:
+                encoder.reset()
         return cookie
 
     def read(self, n=None):
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index aebe67bd87..5cfa472916 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -799,6 +799,37 @@ class StatefulIncrementalDecoderTest(unittest.TestCase):
         self.assertEquals(d.decode(b'oiabcd'), '')
         self.assertEquals(d.decode(b'', 1), 'abcd.')
 
+    def test_append_bom(self):
+        # The BOM is not written again when appending to a non-empty file
+        filename = test_support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with io.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()  # NOTE(review): pos is never read in this test
+            with io.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaa'.encode(charset))
+
+            with io.open(filename, 'a', encoding=charset) as f:
+                f.write('xxx')
+            with io.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
+
+    def test_seek_bom(self):
+        # Writing after a manual seek must not add a BOM in mid-file
+        filename = test_support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with io.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()  # position just after the text written above
+            with io.open(filename, 'r+', encoding=charset) as f:
+                f.seek(pos)  # non-zero position: 'zzz' must follow 'aaa' as-is
+                f.write('zzz')
+                f.seek(0)  # rewind, so 'bbb' is written from the file start
+                f.write('bbb')
+            with io.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'bbbzzz'.encode(charset))
+
+
 class TextIOWrapperTest(unittest.TestCase):
 
     def setUp(self):
diff --git a/Misc/NEWS b/Misc/NEWS
index 8bc2d61d5c..5ce766435f 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -84,6 +84,10 @@ C-API
 Library
 -------
 
+- Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io
+  library. This means, for example, that opening a UTF-16 text file in append
+  mode doesn't add a BOM at the end of the file if the file isn't empty.
+
 - Issue #3704: cookielib was not properly handling URLs with a / in the
   parameters.
 
-- 
2.50.0