granicus.if.org Git - python/commitdiff
Issue #5433: Excessive newline detection optimization in IncrementalNewlineDecoder
author Antoine Pitrou <solipsis@pitrou.net>
Fri, 6 Mar 2009 23:40:56 +0000 (23:40 +0000)
committer Antoine Pitrou <solipsis@pitrou.net>
Fri, 6 Mar 2009 23:40:56 +0000 (23:40 +0000)
Lib/test/test_io.py
Modules/_textio.c

index 3189f9c6deabaaa7dbe4f11c93c2d155ad6c25f8..5fc53eac22b439d88fbd2ee16fddc340cc0e1579 100644 (file)
@@ -1915,6 +1915,19 @@ class IncrementalNewlineDecoderTest(unittest.TestCase):
         decoder = self.IncrementalNewlineDecoder(decoder, translate=True)
         self.check_newline_decoding_utf8(decoder)
 
+    def test_newline_bytes(self):
+        # Issue 5433: Excessive optimization in IncrementalNewlineDecoder
+        def _check(dec):
+            self.assertEquals(dec.newlines, None)
+            self.assertEquals(dec.decode("\u0D00"), "\u0D00")
+            self.assertEquals(dec.newlines, None)
+            self.assertEquals(dec.decode("\u0A00"), "\u0A00")
+            self.assertEquals(dec.newlines, None)
+        dec = self.IncrementalNewlineDecoder(None, translate=False)
+        _check(dec)
+        dec = self.IncrementalNewlineDecoder(None, translate=True)
+        _check(dec)
+
 class CIncrementalNewlineDecoderTest(IncrementalNewlineDecoderTest):
     pass
 
index 145f8eaf9c1cc17296244d22da1b8c5c7228a695..dbed2fdc55b09fa491b74692a18e2888a5d406e3 100644 (file)
@@ -305,22 +305,40 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
            for the \r *byte* with the libc's optimized memchr.
            */
         if (seennl == SEEN_LF || seennl == 0) {
-            int has_cr, has_lf;
-            has_lf = (seennl == SEEN_LF) ||
-                    (memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL);
-            has_cr = (memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
-            if (has_lf && !has_cr) {
-                only_lf = 1;
-                seennl = SEEN_LF;
-            }
+            only_lf = !(memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
         }
 
-        if (!self->translate) {
+        if (only_lf) {
+            /* If not already seen, quick scan for a possible "\n" character.
+               (there's nothing else to be done, even when in translation mode)
+            */
+            if (seennl == 0 &&
+                memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL) {
+                Py_UNICODE *s, *end;
+                s = in_str;
+                end = in_str + len;
+                for (;;) {
+                    Py_UNICODE c;
+                    /* Fast loop for non-control characters */
+                    while (*s > '\n')
+                        s++;
+                    c = *s++;
+                    if (c == '\n') {
+                        seennl |= SEEN_LF;
+                        break;
+                    }
+                    if (s > end)
+                        break;
+                }
+            }
+            /* Finished: we have scanned for newlines, and none of them
+               need translating */
+        }
+        else if (!self->translate) {
             Py_UNICODE *s, *end;
+            /* We have already seen all newline types, no need to scan again */
             if (seennl == SEEN_ALL)
                 goto endscan;
-            if (only_lf)
-                goto endscan;
             s = in_str;
             end = in_str + len;
             for (;;) {
@@ -347,7 +365,7 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
         endscan:
             ;
         }
-        else if (!only_lf) {
+        else {
             PyObject *translated = NULL;
             Py_UNICODE *out_str;
             Py_UNICODE *in, *out, *end;