bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior (GH-7891)

author Ammar Askar <ammar_askar@hotmail.com>

Fri, 6 Jul 2018 07:19:08 +0000 (03:19 -0400)

committer Tal Einat <taleinat+github@gmail.com>

Fri, 6 Jul 2018 07:19:08 +0000 (10:19 +0300)
author Ammar Askar <ammar_askar@hotmail.com>
Fri, 6 Jul 2018 07:19:08 +0000 (03:19 -0400)
committer Tal Einat <taleinat+github@gmail.com>
Fri, 6 Jul 2018 07:19:08 +0000 (10:19 +0300)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 93e40de96e9eb2e04dcbd1bab74e9ad393ac92d2..f68580ccfb7c63c7f42de90c720cfc9cc781fd32 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,7 +1,8 @@
  from test import support
  from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                       STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer, generate_tokens)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
  from io import BytesIO, StringIO
  import unittest
  from unittest import TestCase, mock
@@ -11,27 +12,51 @@ import os
  import token
  
  
+# Converts a source string into a list of textual representations
+# of its tokens, such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Skip the implicit final NEWLINE if the input has no trailing newline
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
  class TokenizeTest(TestCase):
      # Tests for the tokenize module.
  
      # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.
  
      def check_tokenize(self, s, expected):
          # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
          f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
          self.assertEqual(result,
                           ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                           expected.rstrip().splitlines())
  
+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
      def test_basic(self):
          self.check_tokenize("1 + 1", """\
      NUMBER     '1'           (1, 0) (1, 1)
@@ -922,14 +947,9 @@ async def f():
  class GenerateTokensTest(TokenizeTest):
      def check_tokenize(self, s, expected):
          # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
          f = StringIO(s)
-        for type, token, start, end, line in generate_tokens(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
          self.assertEqual(result, expected.rstrip().splitlines())
  
  
@@ -1022,8 +1042,8 @@ class Test_Tokenize(TestCase):
              else:
                  return b''
  
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
          expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
          self.assertEqual(tokens, expected_tokens,
                           "bytes not decoded with encoding")
@@ -1039,8 +1059,8 @@ class Test_Tokenize(TestCase):
              else:
                  return b''
  
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
          expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
          self.assertEqual(tokens, expected_tokens,
                           "string not tokenized when encoding is None")
@@ -1351,18 +1371,21 @@ class TestTokenize(TestCase):
  
          # Test that 500 consequent, one-line defs is OK
          toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE
  
      def assertExactTypeEqual(self, opstr, *optypes):
          tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
          num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
          self.assertEqual(tok_name[tokens[0].exact_type],
                           tok_name[ENCODING])
          for i in range(num_optypes):
              self.assertEqual(tok_name[tokens[i + 1].exact_type],
                               tok_name[optypes[i]])
          self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                           tok_name[token.ENDMARKER])
  
      def test_exact_type(self):
@@ -1515,7 +1538,7 @@ class TestRoundtrip(TestCase):
          self.check_roundtrip("if x == 1:\n"
                               "    print(x)\n")
          self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")
  
          # Some people use different formatting conventions, which makes
          # untokenize a little trickier. Note that this test involves trailing
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index c78d9f7e9ee5af53f235ee4032b8a5e232321c9b..fce010bc5e7aa7f32db477f64b02be8e7ea502df 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
              # BOM will already have been stripped.
              encoding = "utf-8"
          yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+    last_line = b''
+    line = b''
      while True:                                # loop over lines in stream
          try:
+            # Remember the previous line before reading the next one:
+            # readline signals end of input by returning an empty string,
+            # so `line` itself will always have been overwritten with
+            # that empty value by the end of this loop.
+            last_line = line
              line = readline()
          except StopIteration:
              line = b''
@@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
                             (lnum, pos), (lnum, pos+1), line)
                  pos += 1
  
+    # Add an implicit NEWLINE if the input doesn't end in one
+    if last_line and last_line[-1] not in '\r\n':
+        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
      for indent in indents[1:]:                 # pop remaining indent levels
          yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
      yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
diff --git a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst

new file mode 100644 (file)

index 0000000..21c9095
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
@@ -0,0 +1,3 @@
+The tokenize module now implicitly emits a NEWLINE token when given input
+that does not end with a trailing newline.  This behavior now matches what
+the C tokenizer does internally.  Contributed by Ammar Askar.
author	Ammar Askar <ammar_askar@hotmail.com>
	Fri, 6 Jul 2018 07:19:08 +0000 (03:19 -0400)
committer	Tal Einat <taleinat+github@gmail.com>
	Fri, 6 Jul 2018 07:19:08 +0000 (10:19 +0300)
Lib/test/test_tokenize.py		patch \| blob \| history
Lib/tokenize.py		patch \| blob \| history
Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst	[new file with mode: 0644]	patch \| blob