From d88dc48e33d71732708960170e57a3d1bdc8f847 Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Sun, 12 Oct 2008 04:05:48 +0000 Subject: [PATCH] Add a new mode to the lexer which enables it to return all characters, even whitespace, as tokens from the file. This is enabled with L->SetKeepWhitespaceMode(true) on a raw lexer. In this mode, you too can use clang as a really complex version of 'cat' with code like this: Lexer RawLex(SourceLocation::getFileLoc(SM.getMainFileID(), 0), PP.getLangOptions(), File.first, File.second); RawLex.SetKeepWhitespaceMode(true); Token RawTok; RawLex.LexFromRawLexer(RawTok); while (RawTok.isNot(tok::eof)) { std::cout << PP.getSpelling(RawTok); RawLex.LexFromRawLexer(RawTok); } This will emit exactly the input file, with no canonicalization or other translation. Realistic clients actually do something with the tokens of course :) git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@57401 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Lex/Lexer.h | 45 ++++++++++++++++++++------- lib/Lex/Lexer.cpp | 64 ++++++++++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 26 deletions(-) diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index 24189dfe54..7267644eb5 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -66,9 +66,14 @@ class Lexer { /// Note that in raw mode that the PP pointer may be null. bool LexingRawMode; - /// KeepCommentMode - The lexer can optionally keep C & BCPL-style comments, - /// and return them as tokens. This is used for -C and -CC modes. - bool KeepCommentMode; + /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace + /// and return them as tokens. This is used for -C and -CC modes, and + /// whitespace preservation can be useful for some clients that want to lex + /// the file in raw mode and get every character from the file. + /// + /// When this is set to 2 it returns comments and whitespace. 
When set to 1 + /// it returns comments, when it is set to 0 it returns normal tokens only. + unsigned char ExtendedTokenMode; //===--------------------------------------------------------------------===// // Context that changes as the file is lexed. @@ -150,18 +155,36 @@ public: // lexer when in raw mode. return BufferPtr == BufferEnd; } + + /// isKeepWhitespaceMode - Return true if the lexer should return tokens for + /// every character in the file, including whitespace and comments. This + /// should only be used in raw mode, as the preprocessor is not prepared to + /// deal with the excess tokens. + bool isKeepWhitespaceMode() const { + return ExtendedTokenMode > 1; + } + + /// SetKeepWhitespaceMode - This method lets clients enable or disable + /// whitespace retention mode. + void SetKeepWhitespaceMode(bool Val) { + assert((!Val || LexingRawMode) && + "Can only enable whitespace retention in raw mode"); + ExtendedTokenMode = Val ? 2 : 0; + } + + /// inKeepCommentMode - Return true if the lexer should return comments as + /// tokens. + bool inKeepCommentMode() const { + return ExtendedTokenMode > 0; + } /// SetCommentRetentionMode - Change the comment retention mode of the lexer /// to the specified mode. This is really only useful when lexing in raw /// mode, because otherwise the lexer needs to manage this. void SetCommentRetentionState(bool Mode) { - KeepCommentMode = Mode; - } - - /// inKeepCommentMode - Return true if the lexer should return comments as - /// tokens. - bool inKeepCommentMode() const { - return KeepCommentMode; + assert(!isKeepWhitespaceMode() && + "Can't play with comment retention state when retaining whitespace"); + ExtendedTokenMode = Mode ? 
1 : 0; } @@ -370,7 +393,7 @@ private: void LexCharConstant (Token &Result, const char *CurPtr); bool LexEndOfFile (Token &Result, const char *CurPtr); - void SkipWhitespace (Token &Result, const char *CurPtr); + bool SkipWhitespace (Token &Result, const char *CurPtr); bool SkipBCPLComment (Token &Result, const char *CurPtr); bool SkipBlockComment (Token &Result, const char *CurPtr); bool SaveBCPLComment (Token &Result, const char *CurPtr); diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index d2aef76526..65956cb8b7 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -99,8 +99,8 @@ Lexer::Lexer(SourceLocation fileloc, Preprocessor &pp, // or otherwise skipping over tokens. LexingRawMode = false; - // Default to keeping comments if requested. - KeepCommentMode = false; + // Default to keeping comments if the preprocessor wants them. + ExtendedTokenMode = 0; SetCommentRetentionState(PP->getCommentRetentionState()); } @@ -137,7 +137,7 @@ Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, LexingRawMode = true; // Default to not keeping comments in raw mode. - KeepCommentMode = false; + ExtendedTokenMode = 0; } @@ -591,7 +591,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { /// LexStringLiteral - Lex the remainder of a string literal, after having lexed /// either " or L". -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide){ +void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { const char *NulCharacter = 0; // Does this string contain the \0 character? char C = getAndAdvanceChar(CurPtr, Result); @@ -704,7 +704,10 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { /// SkipWhitespace - Efficiently skip over a series of whitespace characters. /// Update BufferPtr to point to the next non-whitespace character and return. 
-void Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { +/// +/// This method forms a token and returns true if KeepWhitespaceMode is enabled. +/// +bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { // Whitespace - Skip it, then return the token after the whitespace. unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. while (1) { @@ -719,7 +722,7 @@ void Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { if (ParsingPreprocessorDirective) { // End of preprocessor directive line, let LexTokenInternal handle this. BufferPtr = CurPtr; - return; + return false; } // ok, but handle newline. @@ -735,7 +738,15 @@ void Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { if (PrevChar != '\n' && PrevChar != '\r') Result.setFlag(Token::LeadingSpace); + // If the client wants us to return whitespace, return it now. + if (isKeepWhitespaceMode()) { + Result.setKind(tok::unknown); + FormTokenWithChars(Result, CurPtr); + return true; + } + BufferPtr = CurPtr; + return false; } // SkipBCPLComment - We have just read the // characters from input. Skip until @@ -817,7 +828,9 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { // Otherwise, eat the \n character. We don't care if this is a \n\r or // \r\n sequence. This is an efficiency hack (because we know the \n can't - // contribute to another token), it isn't needed for correctness. + // contribute to another token), it isn't needed for correctness. Note that + // this is ok even in KeepWhitespaceMode, because we would have returned the + // comment above in that mode. ++CurPtr; // The next returned token is at the start of the line. @@ -832,11 +845,16 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { /// an appropriate way and return it. 
bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { Result.setKind(tok::comment); - FormTokenWithChars(Result, CurPtr); - // If this BCPL-style comment is in a macro definition, transmogrify it into - // a C-style block comment. - if (ParsingPreprocessorDirective) { + if (!ParsingPreprocessorDirective) { + // If we're not in a preprocessor directive, just return the // comment + // directly. + FormTokenWithChars(Result, CurPtr); + } else { + // If this BCPL-style comment is in a macro definition, transmogrify it into + // a C-style block comment. + BufferPtr = CurPtr; + std::string Spelling = PP->getSpelling(Result); assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); Spelling[1] = '*'; // Change prefix to "/*". @@ -1024,7 +1042,8 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { // It is common for the tokens immediately after a /**/ comment to be // whitespace. Instead of going through the big switch, handle it - // efficiently now. + // efficiently now. This is safe even in KeepWhitespaceMode because we would + // have already returned above with the comment as a token. if (isHorizontalWhitespace(*CurPtr)) { Result.setFlag(Token::LeadingSpace); SkipWhitespace(Result, CurPtr+1); @@ -1203,6 +1222,16 @@ LexNextToken: ++CurPtr; while ((*CurPtr == ' ') || (*CurPtr == '\t')) ++CurPtr; + + // If we are keeping whitespace and other tokens, just return what we just + // skipped. The next lexer invocation will return the token after the + // whitespace. + if (isKeepWhitespaceMode()) { + Result.setKind(tok::unknown); + FormTokenWithChars(Result, CurPtr); + return; + } + BufferPtr = CurPtr; Result.setFlag(Token::LeadingSpace); } @@ -1226,7 +1255,9 @@ LexNextToken: Diag(CurPtr-1, diag::null_in_file); Result.setFlag(Token::LeadingSpace); - SkipWhitespace(Result, CurPtr); + if (SkipWhitespace(Result, CurPtr)) + return; // KeepWhitespaceMode + goto LexNextToken; // GCC isn't tail call eliminating. 
case '\n': case '\r': @@ -1249,7 +1280,9 @@ LexNextToken: Result.setFlag(Token::StartOfLine); // No leading whitespace seen so far. Result.clearFlag(Token::LeadingSpace); - SkipWhitespace(Result, CurPtr); + + if (SkipWhitespace(Result, CurPtr)) + return; // KeepWhitespaceMode goto LexNextToken; // GCC isn't tail call eliminating. case ' ': case '\t': @@ -1257,7 +1290,8 @@ LexNextToken: case '\v': SkipHorizontalWhitespace: Result.setFlag(Token::LeadingSpace); - SkipWhitespace(Result, CurPtr); + if (SkipWhitespace(Result, CurPtr)) + return; // KeepWhitespaceMode SkipIgnoredUnits: CurPtr = BufferPtr; -- 2.40.0