From c4ef4896eac86a6759901c8546e26de4695a1389 Mon Sep 17 00:00:00 2001
From: Ammar Askar
Date: Fri, 6 Jul 2018 03:19:08 -0400
Subject: [PATCH] bpo-33899: Make tokenize module mirror end-of-file is
 end-of-line behavior (GH-7891)

Most of the change involves fixing up the test suite, which previously
made the assumption that there wouldn't be a new line if the input
didn't end in one.

Contributed by Ammar Askar.
---
 Lib/test/test_tokenize.py                     | 71 ++++++++++++-------
 Lib/tokenize.py                               | 10 +++
 .../2018-06-24-01-57-14.bpo-33899.IaOcAr.rst  |  3 +
 3 files changed, 60 insertions(+), 24 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 93e40de96e..f68580ccfb 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,7 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer, generate_tokens)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
@@ -11,27 +12,51 @@
 import os
 import token
 
+# Converts a source string into a list of textual representation
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.
 
     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.
 
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
@@ -922,14 +947,9 @@ async def f():
 class GenerateTokensTest(TokenizeTest):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
         f = StringIO(s)
-        for type, token, start, end, line in generate_tokens(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
         self.assertEqual(result, expected.rstrip().splitlines())
 
 
@@ -1022,8 +1042,8 @@ class Test_Tokenize(TestCase):
             else:
                 return b''
 
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1039,8 +1059,8 @@ class Test_Tokenize(TestCase):
             else:
                 return b''
 
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1351,18 +1371,21 @@ class TestTokenize(TestCase):
 
         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE
 
     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(tok_name[tokens[0].exact_type],
                          tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(tok_name[tokens[i + 1].exact_type],
                              tok_name[optypes[i]])
         self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                          tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
@@ -1515,7 +1538,7 @@ class TestRoundtrip(TestCase):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")
 
         # Some people use different formatting conventions, which makes
         # untokenize a little trickier. Note that this test involves trailing
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index c78d9f7e9e..fce010bc5e 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+    last_line = b''
+    line = b''
     while True:                                # loop over lines in stream
         try:
+            # We capture the value of the line variable here because
+            # readline uses the empty string '' to signal end of input,
+            # hence `line` itself will always be overwritten at the end
+            # of this loop.
+            last_line = line
             line = readline()
         except StopIteration:
             line = b''
@@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
                            (lnum, pos), (lnum, pos+1), line)
                 pos += 1
 
+    # Add an implicit NEWLINE if the input doesn't end in one
+    if last_line and last_line[-1] not in '\r\n':
+        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
diff --git a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
new file mode 100644
index 0000000000..21c9095993
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
@@ -0,0 +1,3 @@
+Tokenize module now implicitly emits a NEWLINE when provided with input that
+does not have a trailing new line. This behavior now matches what the C
+tokenizer does internally. Contributed by Ammar Askar.
-- 
2.40.0
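
A quick way for reviewers to see the new behavior outside the test suite is
the sketch below. It is not part of the patch; it simply mirrors what
test_implicit_newline asserts and assumes an interpreter with this change
applied:

    # Demonstration only: tokenize a line of source with no trailing newline.
    from io import BytesIO
    from tokenize import tokenize, NEWLINE, ENDMARKER

    source = b"x = 1"  # note: no trailing '\n'
    tokens = list(tokenize(BytesIO(source).readline))

    # With this change, an implicit (empty-string) NEWLINE token is emitted
    # just before ENDMARKER, matching the C tokenizer's end-of-file handling.
    assert tokens[-2].type == NEWLINE
    assert tokens[-1].type == ENDMARKER
    print(tokens[-2])

Without the patch, the same input ends the token stream at NUMBER followed
directly by ENDMARKER, which is the inconsistency with the C tokenizer that
bpo-33899 reports.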