Fix utf-8-sig incremental decoder, which didn't recognise a BOM when the
first chunk fed to the decoder started with a BOM, but was longer than 3 bytes.

author Walter Dörwald <walter@livinglogic.de>

Thu, 12 Apr 2007 10:35:00 +0000 (10:35 +0000)

committer Walter Dörwald <walter@livinglogic.de>

Thu, 12 Apr 2007 10:35:00 +0000 (10:35 +0000)
author Walter Dörwald <walter@livinglogic.de>
Thu, 12 Apr 2007 10:35:00 +0000 (10:35 +0000)
committer Walter Dörwald <walter@livinglogic.de>
Thu, 12 Apr 2007 10:35:00 +0000 (10:35 +0000)
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py

index d751da69c41585778d5a92783381d0c737b094c4..92678d20a8736820740acfde332bdbc2659132c7 100644 (file)
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -44,14 +44,19 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
          self.first = True
  
      def _buffer_decode(self, input, errors, final):
-        if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
+        if self.first:
              if len(input) < 3:
-                # not enough data to decide if this really is a BOM
-                # => try again on the next call
-                return (u"", 0)
-            (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
-            self.first = False
-            return (output, consumed+3)
+                if codecs.BOM_UTF8.startswith(input):
+                    # not enough data to decide if this really is a BOM
+                    # => try again on the next call
+                    return (u"", 0)
+                else:
+                    self.first = None
+            else:
+                self.first = None
+                if input[:3] == codecs.BOM_UTF8:
+                    (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
+                    return (output, consumed+3)
          return codecs.utf_8_decode(input, errors, final)
  
      def reset(self):
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 3c800f87771164722672ea83bdfa4633ca9cfa46..038962345d330f9cc88bc2faa05cead44d7b4bb4 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -429,6 +429,11 @@ class UTF8SigTest(ReadTest):
          # SF bug #1601501: check that the codec works with a buffer
          unicode("\xef\xbb\xbf", "utf-8-sig")
  
+    def test_bom(self):
+        d = codecs.getincrementaldecoder("utf-8-sig")()
+        s = u"spam"
+        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
+
  class EscapeDecodeTest(unittest.TestCase):
      def test_empty(self):
          self.assertEquals(codecs.escape_decode(""), ("", 0))
diff --git a/Misc/NEWS b/Misc/NEWS

index 4370030c66c66f74f0b4023a9c7efc249533728d..db0c8dc41caa1b2402c574c1007c48450b4f7408 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -591,6 +591,8 @@ Library
  
  - idle: Honor the "Cancel" action in the save dialog (Debian bug #299092).
  
+- Fix utf-8-sig incremental decoder, which didn't recognise a BOM when the
+  first chunk fed to the decoder started with a BOM, but was longer than 3 bytes.
  
  Extension Modules
  -----------------
author	Walter Dörwald <walter@livinglogic.de>
	Thu, 12 Apr 2007 10:35:00 +0000 (10:35 +0000)
committer	Walter Dörwald <walter@livinglogic.de>
	Thu, 12 Apr 2007 10:35:00 +0000 (10:35 +0000)
Lib/encodings/utf_8_sig.py		patch | blob | history
Lib/test/test_codecs.py		patch | blob | history
Misc/NEWS		patch | blob | history