decoder = self.IncrementalNewlineDecoder(decoder, translate=True)
self.check_newline_decoding_utf8(decoder)
+ def test_newline_bytes(self):  # regression test: U+0D00 / U+0A00 must not be counted as "\r" / "\n"
+ # Issue 5433: Excessive optimization in IncrementalNewlineDecoder
+ def _check(dec):  # dec: a newly created decoder that has not decoded anything yet
+ self.assertEquals(dec.newlines, None)  # nothing decoded yet; NOTE(review): assertEquals is a deprecated alias of assertEqual
+ self.assertEquals(dec.decode("\u0D00"), "\u0D00")  # code point holds a 0x0D byte but is not "\r"; must pass through unchanged
+ self.assertEquals(dec.newlines, None)  # so no newline kind may be recorded for it
+ self.assertEquals(dec.decode("\u0A00"), "\u0A00")  # code point holds a 0x0A byte but is not "\n"; must pass through unchanged
+ self.assertEquals(dec.newlines, None)  # likewise, no "\n" may be recorded
+ dec = self.IncrementalNewlineDecoder(None, translate=False)  # same checks without newline translation...
+ _check(dec)
+ dec = self.IncrementalNewlineDecoder(None, translate=True)  # ...and with newline translation enabled
+ _check(dec)
+
class CIncrementalNewlineDecoderTest(IncrementalNewlineDecoderTest):
pass
for the \r *byte* with the libc's optimized memchr.
*/
if (seennl == SEEN_LF || seennl == 0) {
- int has_cr, has_lf;
- has_lf = (seennl == SEEN_LF) ||
- (memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL);
- has_cr = (memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
- if (has_lf && !has_cr) {
- only_lf = 1;
- seennl = SEEN_LF;
- }
+ only_lf = !(memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
}
- if (!self->translate) {
+ if (only_lf) {
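+ /* If "\n" has not been seen yet, do a quick scan for it
+ (there's nothing else to be done, even when in translation mode).
+ memchr() compares raw bytes, so a hit may come from a byte inside
+ a wider character such as U+0A00; the loop below confirms that a
+ real "\n" character is present before recording it.
+ */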
+ if (seennl == 0 &&
+ memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL) {
+ Py_UNICODE *s, *end;
+ s = in_str;
+ end = in_str + len;
+ for (;;) {
+ Py_UNICODE c;
+ /* Fast loop for non-control characters */
+ while (*s > '\n')
+ s++;
+ c = *s++;
+ if (c == '\n') {
+ seennl |= SEEN_LF;
+ break;
+ }
+ if (s > end)
+ break;
+ }
+ }
+ /* Finished: we have scanned for newlines, and none of them
+ need translating */
+ }
+ else if (!self->translate) {
Py_UNICODE *s, *end;
+ /* We have already seen all newline types, no need to scan again */
if (seennl == SEEN_ALL)
goto endscan;
- if (only_lf)
- goto endscan;
s = in_str;
end = in_str + len;
for (;;) {
endscan:
;
}
- else if (!only_lf) {
+ else {
PyObject *translated = NULL;
Py_UNICODE *out_str;
Py_UNICODE *in, *out, *end;