From ed3baeed7e73e7869cbe0879d795a58c9fe30b92 Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Mon, 9 Oct 2017 15:46:13 +0000 Subject: [PATCH] [llvm-rc] Have the tokenizer discard single-line & block comments. This allows rc files to have comments. Eventually we should just use clang's C preprocessor, but that's a somewhat larger effort for minimal gain, and this is straightforward. Differential Revision: https://reviews.llvm.org/D38651 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315207 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-rc/Inputs/tokens.rc | 8 ++++ test/tools/llvm-rc/tokenizer.test | 9 +++++ tools/llvm-rc/ResourceScriptToken.cpp | 53 +++++++++++++++++++++++++ tools/llvm-rc/ResourceScriptTokenList.h | 2 + 4 files changed, 72 insertions(+) diff --git a/test/tools/llvm-rc/Inputs/tokens.rc b/test/tools/llvm-rc/Inputs/tokens.rc index 20619149bb0..217d6017a9d 100644 --- a/test/tools/llvm-rc/Inputs/tokens.rc +++ b/test/tools/llvm-rc/Inputs/tokens.rc @@ -3,6 +3,14 @@ He11o LLVM "RC string test.",L"Another RC string test.'&{",42,100 +Block Comment Ident /*block /* // comment */ ifier +Line Comment // Identifier /* + +/* Multi line + block + comment */ + +Multiple /* comments */ on /* a */ single // line ":))" diff --git a/test/tools/llvm-rc/tokenizer.test b/test/tools/llvm-rc/tokenizer.test index 08c01a2fe73..99cd0f24b50 100644 --- a/test/tools/llvm-rc/tokenizer.test +++ b/test/tools/llvm-rc/tokenizer.test @@ -34,4 +34,13 @@ ; CHECK-NEXT: Int: 42; int value = 42 ; CHECK-NEXT: Comma: , ; CHECK-NEXT: Int: 100; int value = 100 +; CHECK-NEXT: Identifier: Block +; CHECK-NEXT: Identifier: Comment +; CHECK-NEXT: Identifier: Ident +; CHECK-NEXT: Identifier: ifier +; CHECK-NEXT: Identifier: Line +; CHECK-NEXT: Identifier: Comment +; CHECK-NEXT: Identifier: Multiple +; CHECK-NEXT: Identifier: on +; CHECK-NEXT: Identifier: single ; CHECK-NEXT: String: ":))" diff --git a/tools/llvm-rc/ResourceScriptToken.cpp b/tools/llvm-rc/ResourceScriptToken.cpp index
061070b479e..5a3473a4b08 100644 --- a/tools/llvm-rc/ResourceScriptToken.cpp +++ b/tools/llvm-rc/ResourceScriptToken.cpp @@ -121,6 +121,17 @@ private: bool canStartString() const; + // Check if tokenizer can start reading a single line comment (i.e. a comment + // that begins with '//') + bool canStartLineComment() const; + + // Check if tokenizer can start reading a block comment (i.e. a comment that + // begins with '/*' and ends with '*/') + bool canStartBlockComment() const; + + // Throw away all remaining characters on the current line. + void skipCurrentLine(); + bool streamEof() const; // Classify the token that is about to be read from the current position. @@ -134,6 +145,14 @@ private: size_t DataLength, Pos; }; +void Tokenizer::skipCurrentLine() { + Pos = Data.find_first_of("\r\n", Pos); + Pos = Data.find_first_not_of("\r\n", Pos); + + if (Pos == StringRef::npos) + Pos = DataLength; +} + Expected<std::vector<RCToken>> Tokenizer::run() { Pos = 0; std::vector<RCToken> Result; @@ -154,6 +173,10 @@ Expected<std::vector<RCToken>> Tokenizer::run() { if (Error TokenError = consumeToken(TokenKind)) return std::move(TokenError); + // Comments are just deleted, don't bother saving them.
+ if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment) + continue; + RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart)); if (TokenKind == Kind::Identifier) { processIdentifier(Token); @@ -195,6 +218,21 @@ Error Tokenizer::consumeToken(const Kind TokenKind) { advance(); return Error::success(); + case Kind::LineComment: + advance(2); + skipCurrentLine(); + return Error::success(); + + case Kind::StartComment: { + advance(2); + auto EndPos = Data.find("*/", Pos); + if (EndPos == StringRef::npos) + return getStringError( + "Unclosed multi-line comment beginning at position " + Twine(Pos - 2)); + advance(EndPos - Pos); + advance(2); + return Error::success(); + } case Kind::Identifier: while (!streamEof() && canContinueIdentifier()) advance(); @@ -259,6 +297,16 @@ bool Tokenizer::canStartInt() const { return std::isdigit(Data[Pos]); } +bool Tokenizer::canStartBlockComment() const { + assert(!streamEof()); + return Data.drop_front(Pos).startswith("/*"); +} + +bool Tokenizer::canStartLineComment() const { + assert(!streamEof()); + return Data.drop_front(Pos).startswith("//"); +} + bool Tokenizer::canContinueInt() const { assert(!streamEof()); return std::isalnum(Data[Pos]); @@ -271,6 +319,11 @@ bool Tokenizer::canStartString() const { bool Tokenizer::streamEof() const { return Pos == DataLength; } Kind Tokenizer::classifyCurrentToken() const { + if (canStartBlockComment()) + return Kind::StartComment; + if (canStartLineComment()) + return Kind::LineComment; + if (canStartInt()) return Kind::Int; if (canStartString()) diff --git a/tools/llvm-rc/ResourceScriptTokenList.h b/tools/llvm-rc/ResourceScriptTokenList.h index f8d7303e7a8..2a7e15f9332 100644 --- a/tools/llvm-rc/ResourceScriptTokenList.h +++ b/tools/llvm-rc/ResourceScriptTokenList.h @@ -18,6 +18,8 @@ TOKEN(Invalid) // Invalid token. Should not occur in a valid script. TOKEN(Int) // Integer (decimal, octal or hexadecimal). TOKEN(String) // String value.
TOKEN(Identifier) // Script identifier (resource name or type). +TOKEN(LineComment) // Beginning of single-line comment. +TOKEN(StartComment) // Beginning of multi-line comment. // Short tokens. They usually consist of exactly one character. // The definitions are of the form SHORT_TOKEN(TokenName, TokenChar). -- 2.40.0