From be51e43ba2c57b8032286af4e8713485b6dc78c3 Mon Sep 17 00:00:00 2001
From: Gregory Szorc <gregory.szorc@gmail.com>
Date: Thu, 12 Jul 2012 07:21:12 +0000
Subject: [PATCH] [clang.py] Implement Token API

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@160111 91177308-0d34-0410-b5e6-96231b3b80d8
---
 bindings/python/clang/cindex.py               | 198 ++++++++++++++++--
 bindings/python/clang/enumerations.py         |  34 +++
 bindings/python/tests/cindex/test_cursor.py   |  10 +
 .../python/tests/cindex/test_token_kind.py    |  43 ++++
 bindings/python/tests/cindex/test_tokens.py   |  52 +++++
 .../tests/cindex/test_translation_unit.py     |  24 ++-
 6 files changed, 345 insertions(+), 16 deletions(-)
 create mode 100644 bindings/python/clang/enumerations.py
 create mode 100644 bindings/python/tests/cindex/test_token_kind.py
 create mode 100644 bindings/python/tests/cindex/test_tokens.py
diff --git a/bindings/python/clang/cindex.py b/bindings/python/clang/cindex.py
index 1bc7d0bae9..fc0a2a18bb 100644
--- a/bindings/python/clang/cindex.py
+++ b/bindings/python/clang/cindex.py
@@ -65,6 +65,8 @@ call is efficient.
 from ctypes import *
 import collections
 
+import clang.enumerations
+
 def get_cindex_library():
     # FIXME: It's probably not the case that the library is actually found in
     # this location. We need a better system of identifying and loading the
@@ -367,6 +369,98 @@ class FixIt(object):
     def __repr__(self):
         return "<FixIt range %r, value %r>" % (self.range, self.value)
 
+class TokenGroup(object):
+    """Helper class to facilitate token management.
+
+    Tokens are allocated from libclang in chunks. They must be disposed of as a
+    collective group.
+
+    One purpose of this class is for instances to represent groups of allocated
+    tokens. Each token in a group contains a reference back to an instance of
+    this class. When all tokens from a group are garbage collected, it allows
+    this class to be garbage collected. When this class is garbage collected,
+    it calls the libclang destructor which invalidates all tokens in the group.
+
+    You should not instantiate this class outside of this module.
+    """
+    def __init__(self, tu, memory, count):
+        self._tu = tu
+        self._memory = memory
+        self._count = count
+
+    def __del__(self):
+        lib.clang_disposeTokens(self._tu, self._memory, self._count)
+
+    @staticmethod
+    def get_tokens(tu, extent):
+        """Helper method to return all tokens in an extent.
+
+        This functionality is needed multiple places in this module. We define
+        it here because it seems like a logical place.
+        """
+        tokens_memory = POINTER(Token)()
+        tokens_count = c_uint()
+
+        lib.clang_tokenize(tu, extent, byref(tokens_memory),
+                byref(tokens_count))
+
+        count = int(tokens_count.value)
+
+        # If we get no tokens, no memory was allocated. Be sure not to return
+        # anything and potentially call a destructor on nothing.
+        if count < 1:
+            return
+
+        tokens_array = cast(tokens_memory, POINTER(Token * count)).contents
+
+        token_group = TokenGroup(tu, tokens_memory, tokens_count)
+
+        for i in xrange(0, count):
+            token = Token()
+            token.int_data = tokens_array[i].int_data
+            token.ptr_data = tokens_array[i].ptr_data
+            token._tu = tu
+            token._group = token_group
+
+            yield token
+
+class TokenKind(object):
+    """Describes a specific type of a Token."""
+
+    _value_map = {} # int -> TokenKind
+
+    def __init__(self, value, name):
+        """Create a new TokenKind instance from a numeric value and a name."""
+        self.value = value
+        self.name = name
+
+    def __repr__(self):
+        return 'TokenKind.%s' % (self.name,)
+
+    @staticmethod
+    def from_value(value):
+        """Obtain a registered TokenKind instance from its value."""
+        result = TokenKind._value_map.get(value, None)
+
+        if result is None:
+            raise ValueError('Unknown TokenKind: %d' % value)
+
+        return result
+
+    @staticmethod
+    def register(value, name):
+        """Register a new TokenKind enumeration.
+
+        This should only be called at module load time by code within this
+        package.
+        """
+        if value in TokenKind._value_map:
+            raise ValueError('TokenKind already registered: %d' % value)
+
+        kind = TokenKind(value, name)
+        TokenKind._value_map[value] = kind
+        setattr(TokenKind, name, kind)
+
 ### Cursor Kinds ###
 
 class CursorKind(object):
@@ -1181,6 +1275,14 @@ class Cursor(Structure):
             children)
         return iter(children)
 
+    def get_tokens(self):
+        """Obtain Token instances formulating that compose this Cursor.
+
+        This is a generator for Token instances. It returns all tokens which
+        occupy the extent this cursor occupies.
+        """
+        return TokenGroup.get_tokens(self._tu, self.extent)
+
     @staticmethod
     def from_result(res, fn, args):
         assert isinstance(res, Cursor)
@@ -2058,6 +2160,19 @@ class TranslationUnit(ClangObject):
             return CodeCompletionResults(ptr)
         return None
 
+    def get_tokens(self, locations=None, extent=None):
+        """Obtain tokens in this translation unit.
+
+        This is a generator for Token instances. The caller specifies a range
+        of source code to obtain tokens for. The range can be specified as a
+        2-tuple of SourceLocation or as a SourceRange. If both are defined,
+        behavior is undefined.
+        """
+        if locations is not None:
+            extent = SourceRange(start=locations[0], end=locations[1])
+
+        return TokenGroup.get_tokens(self, extent)
+
 class File(ClangObject):
     """
     The File class represents a particular source file that is part of a
@@ -2226,6 +2341,52 @@ class CompilationDatabase(ClangObject):
         """
         return lib.clang_CompilationDatabase_getCompileCommands(self, filename)
 
+class Token(Structure):
+    """Represents a single token from the preprocessor.
+
+    Tokens are effectively segments of source code. Source code is first parsed
+    into tokens before being converted into the AST and Cursors.
+
+    Tokens are obtained from parsed TranslationUnit instances. You currently
+    can't create tokens manually.
+    """
+    _fields_ = [
+        ('int_data', c_uint * 4),
+        ('ptr_data', c_void_p)
+    ]
+
+    @property
+    def spelling(self):
+        """The spelling of this token.
+
+        This is the textual representation of the token in source.
+        """
+        return lib.clang_getTokenSpelling(self._tu, self)
+
+    @property
+    def kind(self):
+        """Obtain the TokenKind of the current token."""
+        return TokenKind.from_value(lib.clang_getTokenKind(self))
+
+    @property
+    def location(self):
+        """The SourceLocation this Token occurs at."""
+        return lib.clang_getTokenLocation(self._tu, self)
+
+    @property
+    def extent(self):
+        """The SourceRange this Token occupies."""
+        return lib.clang_getTokenExtent(self._tu, self)
+
+    @property
+    def cursor(self):
+        """The Cursor this Token corresponds to."""
+        cursor = Cursor()
+
+        lib.clang_annotateTokens(self._tu, byref(self), 1, byref(cursor))
+
+        return cursor
+
 # Now comes the plumbing to hook up the C library.
 
 # Register callback types in common container.
@@ -2240,8 +2401,8 @@ def register_functions(lib):
     to call out to the shared library.
     """
     # Functions are registered in strictly alphabetical order.
-    #lib.clang_annotateTokens.argtype = [TranslationUnit, POINTER(Token),
-    #                                    c_uint, POINTER(Cursor)]
+    lib.clang_annotateTokens.argtype = [TranslationUnit, POINTER(Token),
+                                        c_uint, POINTER(Cursor)]
 
     lib.clang_CompilationDatabase_dispose.argtypes = [c_object_p]
 
@@ -2309,7 +2470,7 @@ def register_functions(lib):
 
     lib.clang_disposeString.argtypes = [_CXString]
 
-    #lib.clang_disposeTokens.argtype = [TranslationUnit, POINTER(Token), c_uint]
+    lib.clang_disposeTokens.argtype = [TranslationUnit, POINTER(Token), c_uint]
 
     lib.clang_disposeTranslationUnit.argtypes = [TranslationUnit]
 
@@ -2543,19 +2704,18 @@ def register_functions(lib):
     lib.clang_getTemplateCursorKind.argtypes = [Cursor]
     lib.clang_getTemplateCursorKind.restype = c_uint
 
-    #lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token]
-    #lib.clang_getTokenExtent.restype = SourceRange
+    lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token]
+    lib.clang_getTokenExtent.restype = SourceRange
 
-    #lib.clang_getTokenKind.argtypes = [Token]
-    #lib.clang_getTokenKind.restype = c_uint
-    #lib.clang_getTokenKind.errcheck = TokenKind.from_result
+    lib.clang_getTokenKind.argtypes = [Token]
+    lib.clang_getTokenKind.restype = c_uint
 
-    #lib.clang_getTokenLocation.argtype = [TranslationUnit, Token]
-    #lib.clang_getTokenLocation.restype = SourceLocation
+    lib.clang_getTokenLocation.argtype = [TranslationUnit, Token]
+    lib.clang_getTokenLocation.restype = SourceLocation
 
-    #lib.clang_getTokenSpelling.argtype = [TranslationUnit, Token]
-    #lib.clang_getTokenSpelling.restype = _CXString
-    #lib.clang_getTokenSpelling.errcheck = _CXString.from_result
+    lib.clang_getTokenSpelling.argtype = [TranslationUnit, Token]
+    lib.clang_getTokenSpelling.restype = _CXString
+    lib.clang_getTokenSpelling.errcheck = _CXString.from_result
 
     lib.clang_getTranslationUnitCursor.argtypes = [TranslationUnit]
     lib.clang_getTranslationUnitCursor.restype = Cursor
@@ -2646,8 +2806,8 @@ def register_functions(lib):
         c_uint]
     lib.clang_saveTranslationUnit.restype = c_int
 
-    #lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange,
-    #    POINTER(POINTER(Token)), POINTER(c_uint)]
+    lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange,
+        POINTER(POINTER(Token)), POINTER(c_uint)]
 
     lib.clang_visitChildren.argtypes = [Cursor, callbacks['cursor_visit'],
         py_object]
@@ -2655,6 +2815,12 @@ def register_functions(lib):
 
 register_functions(lib)
 
+def register_enumerations():
+    for name, value in clang.enumerations.TokenKinds:
+        TokenKind.register(value, name)
+
+register_enumerations()
+
 __all__ = [
     'CodeCompletionResults',
     'CompilationDatabase',
@@ -2668,6 +2834,8 @@ __all__ = [
     'Index',
     'SourceLocation',
     'SourceRange',
+    'TokenKind',
+    'Token',
     'TranslationUnitLoadError',
     'TranslationUnit',
     'TypeKind',
diff --git a/bindings/python/clang/enumerations.py b/bindings/python/clang/enumerations.py
new file mode 100644
index 0000000000..a86a48ade3
--- /dev/null
+++ b/bindings/python/clang/enumerations.py
@@ -0,0 +1,34 @@
+#===- enumerations.py - Python Enumerations ------------------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+"""
+Clang Enumerations
+==================
+
+This module provides static definitions of enumerations that exist in libclang.
+
+Enumerations are typically defined as a list of tuples. The exported values are
+typically munged into other types or classes at module load time.
+
+All enumerations are centrally defined in this file so they are all grouped
+together and easier to audit. And, maybe even one day this file will be
+automatically generated by scanning the libclang headers!
+"""
+
+# Maps to CXTokenKind. Note that libclang maintains a separate set of token
+# enumerations from the C++ API.
+TokenKinds = [
+    ('PUNCTUATION', 0),
+    ('KEYWORD', 1),
+    ('IDENTIFIER', 2),
+    ('LITERAL', 3),
+    ('COMMENT', 4),
+]
+
+__all__ = ['TokenKinds']
diff --git a/bindings/python/tests/cindex/test_cursor.py b/bindings/python/tests/cindex/test_cursor.py
index 979838b21c..51695e20b0 100644
--- a/bindings/python/tests/cindex/test_cursor.py
+++ b/bindings/python/tests/cindex/test_cursor.py
@@ -231,3 +231,13 @@ def test_result_type():
     assert foo is not None
     t = foo.result_type
     assert t.kind == TypeKind.INT
+
+def test_get_tokens():
+    """Ensure we can map cursors back to tokens."""
+    tu = get_tu('int foo(int i);')
+    foo = get_cursor(tu, 'foo')
+
+    tokens = list(foo.get_tokens())
+    assert len(tokens) == 7
+    assert tokens[0].spelling == 'int'
+    assert tokens[1].spelling == 'foo'
diff --git a/bindings/python/tests/cindex/test_token_kind.py b/bindings/python/tests/cindex/test_token_kind.py
new file mode 100644
index 0000000000..62ec63e0ad
--- /dev/null
+++ b/bindings/python/tests/cindex/test_token_kind.py
@@ -0,0 +1,43 @@
+from clang.cindex import TokenKind
+from nose.tools import eq_
+from nose.tools import ok_
+from nose.tools import raises
+
+def test_constructor():
+    """Ensure TokenKind constructor works as expected."""
+
+    t = TokenKind(5, 'foo')
+
+    eq_(t.value, 5)
+    eq_(t.name, 'foo')
+
+@raises(ValueError)
+def test_bad_register():
+    """Ensure a duplicate value is rejected for registration."""
+
+    TokenKind.register(2, 'foo')
+
+@raises(ValueError)
+def test_unknown_value():
+    """Ensure trying to fetch an unknown value raises."""
+
+    TokenKind.from_value(-1)
+
+def test_registration():
+    """Ensure that items registered appear as class attributes."""
+    ok_(hasattr(TokenKind, 'LITERAL'))
+    literal = TokenKind.LITERAL
+
+    ok_(isinstance(literal, TokenKind))
+
+def test_from_value():
+    """Ensure registered values can be obtained from from_value()."""
+    t = TokenKind.from_value(3)
+    ok_(isinstance(t, TokenKind))
+    eq_(t, TokenKind.LITERAL)
+
+def test_repr():
+    """Ensure repr() works."""
+
+    r = repr(TokenKind.LITERAL)
+    eq_(r, 'TokenKind.LITERAL')
diff --git a/bindings/python/tests/cindex/test_tokens.py b/bindings/python/tests/cindex/test_tokens.py
new file mode 100644
index 0000000000..7074842909
--- /dev/null
+++ b/bindings/python/tests/cindex/test_tokens.py
@@ -0,0 +1,52 @@
+from clang.cindex import CursorKind
+from clang.cindex import Index
+from clang.cindex import SourceLocation
+from clang.cindex import SourceRange
+from clang.cindex import TokenKind
+from nose.tools import eq_
+from nose.tools import ok_
+
+from .util import get_tu
+
+def test_token_to_cursor():
+    """Ensure we can obtain a Cursor from a Token instance."""
+    tu = get_tu('int i = 5;')
+    r = tu.get_extent('t.c', (0, 9))
+    tokens = list(tu.get_tokens(extent=r))
+
+    assert len(tokens) == 5
+    assert tokens[1].spelling == 'i'
+    assert tokens[1].kind == TokenKind.IDENTIFIER
+
+    cursor = tokens[1].cursor
+    assert cursor.kind == CursorKind.VAR_DECL
+    assert tokens[1].cursor == tokens[2].cursor
+
+def test_token_location():
+    """Ensure Token.location works."""
+
+    tu = get_tu('int foo = 10;')
+    r = tu.get_extent('t.c', (0, 11))
+
+    tokens = list(tu.get_tokens(extent=r))
+    eq_(len(tokens), 4)
+
+    loc = tokens[1].location
+    ok_(isinstance(loc, SourceLocation))
+    eq_(loc.line, 1)
+    eq_(loc.column, 5)
+    eq_(loc.offset, 4)
+
+def test_token_extent():
+    """Ensure Token.extent works."""
+    tu = get_tu('int foo = 10;')
+    r = tu.get_extent('t.c', (0, 11))
+
+    tokens = list(tu.get_tokens(extent=r))
+    eq_(len(tokens), 4)
+
+    extent = tokens[1].extent
+    ok_(isinstance(extent, SourceRange))
+
+    eq_(extent.start.offset, 4)
+    eq_(extent.end.offset, 7)
diff --git a/bindings/python/tests/cindex/test_translation_unit.py b/bindings/python/tests/cindex/test_translation_unit.py
index 9de12ad462..c91f126097 100644
--- a/bindings/python/tests/cindex/test_translation_unit.py
+++ b/bindings/python/tests/cindex/test_translation_unit.py
@@ -1,3 +1,6 @@
+import gc
+import os
+
 from clang.cindex import CursorKind
 from clang.cindex import Cursor
 from clang.cindex import File
@@ -8,7 +11,6 @@ from clang.cindex import TranslationUnitSaveError
 from clang.cindex import TranslationUnit
 from .util import get_cursor
 from .util import get_tu
-import os
 
 kInputsDir = os.path.join(os.path.dirname(__file__), 'INPUTS')
 
@@ -217,3 +219,23 @@ def test_get_source_range():
     assert r.end.offset == 5
     assert r.start.file.name == 't.c'
     assert r.end.file.name == 't.c'
+
+def test_get_tokens_gc():
+    """Ensures get_tokens() works properly with garbage collection."""
+
+    tu = get_tu('int foo();')
+    r = tu.get_extent('t.c', (0, 10))
+    tokens = list(tu.get_tokens(extent=r))
+
+    assert tokens[0].spelling == 'int'
+    gc.collect()
+    assert tokens[0].spelling == 'int'
+
+    del tokens[1]
+    gc.collect()
+    assert tokens[0].spelling == 'int'
+
+    # May trigger segfault if we don't do our job properly.
+    del tokens
+    gc.collect()
+    gc.collect() # Just in case.
-- 
2.40.0