From be51e43ba2c57b8032286af4e8713485b6dc78c3 Mon Sep 17 00:00:00 2001 From: Gregory Szorc Date: Thu, 12 Jul 2012 07:21:12 +0000 Subject: [PATCH] [clang.py] Implement Token API git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@160111 91177308-0d34-0410-b5e6-96231b3b80d8 --- bindings/python/clang/cindex.py | 198 ++++++++++++++++-- bindings/python/clang/enumerations.py | 34 +++ bindings/python/tests/cindex/test_cursor.py | 10 + .../python/tests/cindex/test_token_kind.py | 43 ++++ bindings/python/tests/cindex/test_tokens.py | 52 +++++ .../tests/cindex/test_translation_unit.py | 24 ++- 6 files changed, 345 insertions(+), 16 deletions(-) create mode 100644 bindings/python/clang/enumerations.py create mode 100644 bindings/python/tests/cindex/test_token_kind.py create mode 100644 bindings/python/tests/cindex/test_tokens.py diff --git a/bindings/python/clang/cindex.py b/bindings/python/clang/cindex.py index 1bc7d0bae9..fc0a2a18bb 100644 --- a/bindings/python/clang/cindex.py +++ b/bindings/python/clang/cindex.py @@ -65,6 +65,8 @@ call is efficient. from ctypes import * import collections +import clang.enumerations + def get_cindex_library(): # FIXME: It's probably not the case that the library is actually found in # this location. We need a better system of identifying and loading the @@ -367,6 +369,98 @@ class FixIt(object): def __repr__(self): return "" % (self.range, self.value) +class TokenGroup(object): + """Helper class to facilitate token management. + + Tokens are allocated from libclang in chunks. They must be disposed of as a + collective group. + + One purpose of this class is for instances to represent groups of allocated + tokens. Each token in a group contains a reference back to an instance of + this class. When all tokens from a group are garbage collected, it allows + this class to be garbage collected. When this class is garbage collected, + it calls the libclang destructor which invalidates all tokens in the group. 
+ + You should not instantiate this class outside of this module. + """ + def __init__(self, tu, memory, count): + self._tu = tu + self._memory = memory + self._count = count + + def __del__(self): + lib.clang_disposeTokens(self._tu, self._memory, self._count) + + @staticmethod + def get_tokens(tu, extent): + """Helper method to return all tokens in an extent. + + This functionality is needed multiple places in this module. We define + it here because it seems like a logical place. + """ + tokens_memory = POINTER(Token)() + tokens_count = c_uint() + + lib.clang_tokenize(tu, extent, byref(tokens_memory), + byref(tokens_count)) + + count = int(tokens_count.value) + + # If we get no tokens, no memory was allocated. Be sure not to return + # anything and potentially call a destructor on nothing. + if count < 1: + return + + tokens_array = cast(tokens_memory, POINTER(Token * count)).contents + + token_group = TokenGroup(tu, tokens_memory, tokens_count) + + for i in xrange(0, count): + token = Token() + token.int_data = tokens_array[i].int_data + token.ptr_data = tokens_array[i].ptr_data + token._tu = tu + token._group = token_group + + yield token + +class TokenKind(object): + """Describes a specific type of a Token.""" + + _value_map = {} # int -> TokenKind + + def __init__(self, value, name): + """Create a new TokenKind instance from a numeric value and a name.""" + self.value = value + self.name = name + + def __repr__(self): + return 'TokenKind.%s' % (self.name,) + + @staticmethod + def from_value(value): + """Obtain a registered TokenKind instance from its value.""" + result = TokenKind._value_map.get(value, None) + + if result is None: + raise ValueError('Unknown TokenKind: %d' % value) + + return result + + @staticmethod + def register(value, name): + """Register a new TokenKind enumeration. + + This should only be called at module load time by code within this + package. 
+ """ + if value in TokenKind._value_map: + raise ValueError('TokenKind already registered: %d' % value) + + kind = TokenKind(value, name) + TokenKind._value_map[value] = kind + setattr(TokenKind, name, kind) + ### Cursor Kinds ### class CursorKind(object): @@ -1181,6 +1275,14 @@ class Cursor(Structure): children) return iter(children) + def get_tokens(self): + """Obtain the Token instances that compose this Cursor. + + This is a generator for Token instances. It returns all tokens which + occupy the extent this cursor occupies. + """ + return TokenGroup.get_tokens(self._tu, self.extent) + @staticmethod def from_result(res, fn, args): assert isinstance(res, Cursor) @@ -2058,6 +2160,19 @@ class TranslationUnit(ClangObject): return CodeCompletionResults(ptr) return None + def get_tokens(self, locations=None, extent=None): + """Obtain tokens in this translation unit. + + This is a generator for Token instances. The caller specifies a range + of source code to obtain tokens for. The range can be specified as a + 2-tuple of SourceLocation or as a SourceRange. If both are defined, + behavior is undefined. + """ + if locations is not None: + extent = SourceRange(start=locations[0], end=locations[1]) + + return TokenGroup.get_tokens(self, extent) + class File(ClangObject): """ The File class represents a particular source file that is part of a @@ -2226,6 +2341,52 @@ class CompilationDatabase(ClangObject): """ return lib.clang_CompilationDatabase_getCompileCommands(self, filename) +class Token(Structure): + """Represents a single token from the preprocessor. + + Tokens are effectively segments of source code. Source code is first parsed + into tokens before being converted into the AST and Cursors. + + Tokens are obtained from parsed TranslationUnit instances. You currently + can't create tokens manually. + """ + _fields_ = [ + ('int_data', c_uint * 4), + ('ptr_data', c_void_p) + ] + + @property + def spelling(self): + """The spelling of this token.
+ + This is the textual representation of the token in source. + """ + return lib.clang_getTokenSpelling(self._tu, self) + + @property + def kind(self): + """Obtain the TokenKind of the current token.""" + return TokenKind.from_value(lib.clang_getTokenKind(self)) + + @property + def location(self): + """The SourceLocation this Token occurs at.""" + return lib.clang_getTokenLocation(self._tu, self) + + @property + def extent(self): + """The SourceRange this Token occupies.""" + return lib.clang_getTokenExtent(self._tu, self) + + @property + def cursor(self): + """The Cursor this Token corresponds to.""" + cursor = Cursor() + + lib.clang_annotateTokens(self._tu, byref(self), 1, byref(cursor)) + + return cursor + # Now comes the plumbing to hook up the C library. # Register callback types in common container. @@ -2240,8 +2401,8 @@ def register_functions(lib): to call out to the shared library. """ # Functions are registered in strictly alphabetical order. - #lib.clang_annotateTokens.argtype = [TranslationUnit, POINTER(Token), - # c_uint, POINTER(Cursor)] + lib.clang_annotateTokens.argtypes = [TranslationUnit, POINTER(Token), + c_uint, POINTER(Cursor)] lib.clang_CompilationDatabase_dispose.argtypes = [c_object_p] @@ -2309,7 +2470,7 @@ def register_functions(lib): lib.clang_disposeString.argtypes = [_CXString] - #lib.clang_disposeTokens.argtype = [TranslationUnit, POINTER(Token), c_uint] + lib.clang_disposeTokens.argtypes = [TranslationUnit, POINTER(Token), c_uint] lib.clang_disposeTranslationUnit.argtypes = [TranslationUnit] @@ -2543,19 +2704,18 @@ def register_functions(lib): lib.clang_getTemplateCursorKind.argtypes = [Cursor] lib.clang_getTemplateCursorKind.restype = c_uint - #lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token] - #lib.clang_getTokenExtent.restype = SourceRange + lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token] + lib.clang_getTokenExtent.restype = SourceRange - #lib.clang_getTokenKind.argtypes = [Token] -
#lib.clang_getTokenKind.restype = c_uint - #lib.clang_getTokenKind.errcheck = TokenKind.from_result + lib.clang_getTokenKind.argtypes = [Token] + lib.clang_getTokenKind.restype = c_uint - #lib.clang_getTokenLocation.argtype = [TranslationUnit, Token] - #lib.clang_getTokenLocation.restype = SourceLocation + lib.clang_getTokenLocation.argtypes = [TranslationUnit, Token] + lib.clang_getTokenLocation.restype = SourceLocation - #lib.clang_getTokenSpelling.argtype = [TranslationUnit, Token] - #lib.clang_getTokenSpelling.restype = _CXString - #lib.clang_getTokenSpelling.errcheck = _CXString.from_result + lib.clang_getTokenSpelling.argtypes = [TranslationUnit, Token] + lib.clang_getTokenSpelling.restype = _CXString + lib.clang_getTokenSpelling.errcheck = _CXString.from_result lib.clang_getTranslationUnitCursor.argtypes = [TranslationUnit] lib.clang_getTranslationUnitCursor.restype = Cursor @@ -2646,8 +2806,8 @@ def register_functions(lib): c_uint] lib.clang_saveTranslationUnit.restype = c_int - #lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange, - # POINTER(POINTER(Token)), POINTER(c_uint)] + lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange, + POINTER(POINTER(Token)), POINTER(c_uint)] lib.clang_visitChildren.argtypes = [Cursor, callbacks['cursor_visit'], py_object] @@ -2655,6 +2815,12 @@ def register_functions(lib): register_functions(lib) +def register_enumerations(): + for name, value in clang.enumerations.TokenKinds: + TokenKind.register(value, name) + +register_enumerations() + __all__ = [ 'CodeCompletionResults', 'CompilationDatabase', @@ -2668,6 +2834,8 @@ __all__ = [ 'Index', 'SourceLocation', 'SourceRange', + 'TokenKind', + 'Token', 'TranslationUnitLoadError', 'TranslationUnit', 'TypeKind', diff --git a/bindings/python/clang/enumerations.py b/bindings/python/clang/enumerations.py new file mode 100644 index 0000000000..a86a48ade3 --- /dev/null +++ b/bindings/python/clang/enumerations.py @@ -0,0 +1,34 @@ +#===- enumerations.py - Python Enumerations
------------------*- python -*--===# +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +#===------------------------------------------------------------------------===# + +""" +Clang Enumerations +================== + +This module provides static definitions of enumerations that exist in libclang. + +Enumerations are typically defined as a list of tuples. The exported values are +typically munged into other types or classes at module load time. + +All enumerations are centrally defined in this file so they are all grouped +together and easier to audit. And, maybe even one day this file will be +automatically generated by scanning the libclang headers! +""" + +# Maps to CXTokenKind. Note that libclang maintains a separate set of token +# enumerations from the C++ API. +TokenKinds = [ + ('PUNCTUATION', 0), + ('KEYWORD', 1), + ('IDENTIFIER', 2), + ('LITERAL', 3), + ('COMMENT', 4), +] + +__all__ = ['TokenKinds'] diff --git a/bindings/python/tests/cindex/test_cursor.py b/bindings/python/tests/cindex/test_cursor.py index 979838b21c..51695e20b0 100644 --- a/bindings/python/tests/cindex/test_cursor.py +++ b/bindings/python/tests/cindex/test_cursor.py @@ -231,3 +231,13 @@ def test_result_type(): assert foo is not None t = foo.result_type assert t.kind == TypeKind.INT + +def test_get_tokens(): + """Ensure we can map cursors back to tokens.""" + tu = get_tu('int foo(int i);') + foo = get_cursor(tu, 'foo') + + tokens = list(foo.get_tokens()) + assert len(tokens) == 7 + assert tokens[0].spelling == 'int' + assert tokens[1].spelling == 'foo' diff --git a/bindings/python/tests/cindex/test_token_kind.py b/bindings/python/tests/cindex/test_token_kind.py new file mode 100644 index 0000000000..62ec63e0ad --- /dev/null +++ b/bindings/python/tests/cindex/test_token_kind.py @@ -0,0 +1,43 @@ +from clang.cindex import TokenKind +from nose.tools import eq_ +from nose.tools import 
ok_ +from nose.tools import raises + +def test_constructor(): + """Ensure TokenKind constructor works as expected.""" + + t = TokenKind(5, 'foo') + + eq_(t.value, 5) + eq_(t.name, 'foo') + +@raises(ValueError) +def test_bad_register(): + """Ensure a duplicate value is rejected for registration.""" + + TokenKind.register(2, 'foo') + +@raises(ValueError) +def test_unknown_value(): + """Ensure trying to fetch an unknown value raises.""" + + TokenKind.from_value(-1) + +def test_registration(): + """Ensure that items registered appear as class attributes.""" + ok_(hasattr(TokenKind, 'LITERAL')) + literal = TokenKind.LITERAL + + ok_(isinstance(literal, TokenKind)) + +def test_from_value(): + """Ensure registered values can be obtained from from_value().""" + t = TokenKind.from_value(3) + ok_(isinstance(t, TokenKind)) + eq_(t, TokenKind.LITERAL) + +def test_repr(): + """Ensure repr() works.""" + + r = repr(TokenKind.LITERAL) + eq_(r, 'TokenKind.LITERAL') diff --git a/bindings/python/tests/cindex/test_tokens.py b/bindings/python/tests/cindex/test_tokens.py new file mode 100644 index 0000000000..7074842909 --- /dev/null +++ b/bindings/python/tests/cindex/test_tokens.py @@ -0,0 +1,52 @@ +from clang.cindex import CursorKind +from clang.cindex import Index +from clang.cindex import SourceLocation +from clang.cindex import SourceRange +from clang.cindex import TokenKind +from nose.tools import eq_ +from nose.tools import ok_ + +from .util import get_tu + +def test_token_to_cursor(): + """Ensure we can obtain a Cursor from a Token instance.""" + tu = get_tu('int i = 5;') + r = tu.get_extent('t.c', (0, 9)) + tokens = list(tu.get_tokens(extent=r)) + + assert len(tokens) == 5 + assert tokens[1].spelling == 'i' + assert tokens[1].kind == TokenKind.IDENTIFIER + + cursor = tokens[1].cursor + assert cursor.kind == CursorKind.VAR_DECL + assert tokens[1].cursor == tokens[2].cursor + +def test_token_location(): + """Ensure Token.location works.""" + + tu = get_tu('int foo = 10;') + r = 
tu.get_extent('t.c', (0, 11)) + + tokens = list(tu.get_tokens(extent=r)) + eq_(len(tokens), 4) + + loc = tokens[1].location + ok_(isinstance(loc, SourceLocation)) + eq_(loc.line, 1) + eq_(loc.column, 5) + eq_(loc.offset, 4) + +def test_token_extent(): + """Ensure Token.extent works.""" + tu = get_tu('int foo = 10;') + r = tu.get_extent('t.c', (0, 11)) + + tokens = list(tu.get_tokens(extent=r)) + eq_(len(tokens), 4) + + extent = tokens[1].extent + ok_(isinstance(extent, SourceRange)) + + eq_(extent.start.offset, 4) + eq_(extent.end.offset, 7) diff --git a/bindings/python/tests/cindex/test_translation_unit.py b/bindings/python/tests/cindex/test_translation_unit.py index 9de12ad462..c91f126097 100644 --- a/bindings/python/tests/cindex/test_translation_unit.py +++ b/bindings/python/tests/cindex/test_translation_unit.py @@ -1,3 +1,6 @@ +import gc +import os + from clang.cindex import CursorKind from clang.cindex import Cursor from clang.cindex import File @@ -8,7 +11,6 @@ from clang.cindex import TranslationUnitSaveError from clang.cindex import TranslationUnit from .util import get_cursor from .util import get_tu -import os kInputsDir = os.path.join(os.path.dirname(__file__), 'INPUTS') @@ -217,3 +219,23 @@ def test_get_source_range(): assert r.end.offset == 5 assert r.start.file.name == 't.c' assert r.end.file.name == 't.c' + +def test_get_tokens_gc(): + """Ensures get_tokens() works properly with garbage collection.""" + + tu = get_tu('int foo();') + r = tu.get_extent('t.c', (0, 10)) + tokens = list(tu.get_tokens(extent=r)) + + assert tokens[0].spelling == 'int' + gc.collect() + assert tokens[0].spelling == 'int' + + del tokens[1] + gc.collect() + assert tokens[0].spelling == 'int' + + # May trigger segfault if we don't do our job properly. + del tokens + gc.collect() + gc.collect() # Just in case. -- 2.40.0