bpo-12486: Document tokenize.generate_tokens() as public API (#6957)

author Thomas Kluyver <takowl@gmail.com>

Tue, 5 Jun 2018 17:26:39 +0000 (19:26 +0200)

committer Carol Willing <carolcode@willingconsulting.com>

Tue, 5 Jun 2018 17:26:39 +0000 (10:26 -0700)
author Thomas Kluyver <takowl@gmail.com>
Tue, 5 Jun 2018 17:26:39 +0000 (19:26 +0200)
committer Carol Willing <carolcode@willingconsulting.com>
Tue, 5 Jun 2018 17:26:39 +0000 (10:26 -0700)
diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst

index 4c0a0ceef7dc4e7bfac203d4dca8937bac34992f..111289c767f35c3e7be2fbb2f8571eb08c8ce79a 100644 (file)
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -57,6 +57,16 @@ The primary entry point is a :term:`generator`:
     :func:`.tokenize` determines the source encoding of the file by looking for a
     UTF-8 BOM or encoding cookie, according to :pep:`263`.
  
+.. function:: generate_tokens(readline)
+
+   Tokenize a source, reading unicode strings instead of bytes.
+
+   Like :func:`.tokenize`, the *readline* argument is a callable returning
+   a single line of input. However, :func:`generate_tokens` expects *readline*
+   to return a str object rather than bytes.
+
+   The result is an iterator yielding named tuples like :func:`.tokenize`,
+   except that it does not yield an :data:`~token.ENCODING` token.
  
  All constants from the :mod:`token` module are also exported from
  :mod:`tokenize`.
@@ -79,7 +89,8 @@ write back the modified script.
      positions) may change.
  
      It returns bytes, encoded using the :data:`~token.ENCODING` token, which
-    is the first token sequence output by :func:`.tokenize`.
+    is the first token sequence output by :func:`.tokenize`. If there is no
+    encoding token in the input, it returns a str instead.
  
  
  :func:`.tokenize` needs to detect the encoding of source files it tokenizes. The
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 3520a67bd42b111fc3f42ca50399e90106617567..93e40de96e9eb2e04dcbd1bab74e9ad393ac92d2 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,8 +1,8 @@
  from test import support
  from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                       STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
-from io import BytesIO
+                     open as tokenize_open, Untokenizer, generate_tokens)
+from io import BytesIO, StringIO
  import unittest
  from unittest import TestCase, mock
  from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
@@ -919,6 +919,19 @@ async def f():
      DEDENT     ''            (7, 0) (7, 0)
      """)
  
+class GenerateTokensTest(TokenizeTest):
+    def check_tokenize(self, s, expected):
+        # Format the tokens in s in a table format.
+        # The ENDMARKER is omitted.
+        result = []
+        f = StringIO(s)
+        for type, token, start, end, line in generate_tokens(f.readline):
+            if type == ENDMARKER:
+                break
+            type = tok_name[type]
+            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        self.assertEqual(result, expected.rstrip().splitlines())
+
  
  def decistmt(s):
      result = []
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index 40e6a8b9297b242aa8294926ad8b2da4d342414f..c78d9f7e9ee5af53f235ee4032b8a5e232321c9b 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -37,7 +37,7 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
  blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
  
  import token
-__all__ = token.__all__ + ["tokenize", "detect_encoding",
+__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                             "untokenize", "TokenInfo"]
  del token
  
@@ -653,9 +653,12 @@ def _tokenize(readline, encoding):
      yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
  
  
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
  def generate_tokens(readline):
+    """Tokenize a source reading Python code as unicode strings.
+
+    This has the same API as tokenize(), except that it expects the *readline*
+    callable to return str objects instead of bytes.
+    """
      return _tokenize(readline, None)
  
  def main():
diff --git a/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst b/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst

new file mode 100644 (file)

index 0000000..89c88e2
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst
@@ -0,0 +1,2 @@
+:func:`tokenize.generate_tokens` is now documented as a public API to
+tokenize unicode strings. It was previously present but undocumented.
author	Thomas Kluyver <takowl@gmail.com>
	Tue, 5 Jun 2018 17:26:39 +0000 (19:26 +0200)
committer	Carol Willing <carolcode@willingconsulting.com>
	Tue, 5 Jun 2018 17:26:39 +0000 (10:26 -0700)
Doc/library/tokenize.rst		patch \| blob \| history
Lib/test/test_tokenize.py		patch \| blob \| history
Lib/tokenize.py		patch \| blob \| history
Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst	[new file with mode: 0644]	patch \| blob