From a2fc7dbc981deab96d1633331da793e2333c7521 Mon Sep 17 00:00:00 2001 From: Alexander Kornienko Date: Thu, 10 Aug 2017 10:06:16 +0000 Subject: [PATCH] [Lexer] Finding beginning of token with escaped new line MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Summary: Lexer::GetBeginningOfToken produced invalid location when backtracking across escaped new lines. This fixes PR26228 Reviewers: akyrtzi, alexfh, rsmith, doug.gregor Reviewed By: alexfh Subscribers: alexfh, cfe-commits Patch by Paweł Żukowski! Differential Revision: https://reviews.llvm.org/D30748 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@310576 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Lex/Lexer.h | 4 +++ lib/Lex/Lexer.cpp | 72 ++++++++++++++++++++++--------------- unittests/Lex/LexerTest.cpp | 53 +++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 28 deletions(-) diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index 3be733167e..aa8bf3891e 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -463,6 +463,10 @@ public: /// \brief Returns true if the given character could appear in an identifier. static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts); + /// \brief Checks whether new line pointed by Str is preceded by escape + /// sequence. + static bool isNewLineEscaped(const char *BufferStart, const char *Str); + /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever /// emit a warning. static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 61bcef8cb7..79472961c0 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -463,19 +463,15 @@ static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { const char *BufStart = Buffer.data(); if (Offset >= Buffer.size()) return nullptr; - const char *StrData = BufStart + Offset; - if (StrData[0] == '\n' || StrData[0] == '\r') - return StrData; - - const char *LexStart = StrData; - while (LexStart != BufStart) { - if (LexStart[0] == '\n' || LexStart[0] == '\r') { + const char *LexStart = BufStart + Offset; + for (; LexStart != BufStart; --LexStart) { + if (isVerticalWhitespace(LexStart[0]) && + !Lexer::isNewLineEscaped(BufStart, LexStart)) { + // LexStart should point at first character of logical line. ++LexStart; break; } - - --LexStart; } return LexStart; } @@ -487,7 +483,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc, std::pair LocInfo = SM.getDecomposedLoc(Loc); if (LocInfo.first.isInvalid()) return Loc; - + bool Invalid = false; StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); if (Invalid) @@ -499,31 +495,31 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); if (!LexStart || LexStart == StrData) return Loc; - + // Create a lexer starting at the beginning of this token. SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, Buffer.end()); TheLexer.SetCommentRetentionState(true); - + // Lex tokens until we find the token that contains the source location. Token TheTok; do { TheLexer.LexFromRawLexer(TheTok); - + if (TheLexer.getBufferLocation() > StrData) { // Lexing this token has taken the lexer past the source location we're // looking for. If the current token encompasses our source location, // return the beginning of that token. if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) return TheTok.getLocation(); - + // We ended up skipping over the source location entirely, which means // that it points into whitespace. We're done here. break; } } while (TheTok.getKind() != tok::eof); - + // We've passed our source location; just return the original source location. return Loc; } @@ -531,20 +527,20 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc, SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { - if (Loc.isFileID()) - return getBeginningOfFileToken(Loc, SM, LangOpts); - - if (!SM.isMacroArgExpansion(Loc)) - return Loc; + if (Loc.isFileID()) + return getBeginningOfFileToken(Loc, SM, LangOpts); + + if (!SM.isMacroArgExpansion(Loc)) + return Loc; - SourceLocation FileLoc = SM.getSpellingLoc(Loc); - SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); - std::pair FileLocInfo = SM.getDecomposedLoc(FileLoc); - std::pair BeginFileLocInfo - = SM.getDecomposedLoc(BeginFileLoc); - assert(FileLocInfo.first == BeginFileLocInfo.first && - FileLocInfo.second >= BeginFileLocInfo.second); - return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); + SourceLocation FileLoc = SM.getSpellingLoc(Loc); + SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); + std::pair FileLocInfo = SM.getDecomposedLoc(FileLoc); + std::pair BeginFileLocInfo = + SM.getDecomposedLoc(BeginFileLoc); + assert(FileLocInfo.first == BeginFileLocInfo.first && + FileLocInfo.second >= BeginFileLocInfo.second); + return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); } namespace { @@ -1032,6 +1028,26 @@ bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { return isIdentifierBody(c, LangOpts.DollarIdents); } +bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { + assert(isVerticalWhitespace(Str[0])); + if (Str - 1 < BufferStart) + return false; + + if ((Str[0] == '\n' && Str[-1] == '\r') || + (Str[0] == '\r' && Str[-1] == '\n')) { + if (Str - 2 < BufferStart) + return false; + --Str; + } + --Str; + + // Rewind to first non-space character: + while (Str > BufferStart && isHorizontalWhitespace(*Str)) + --Str; + + return *Str == '\\'; +} + StringRef Lexer::getIndentationForLine(SourceLocation Loc, const SourceManager &SM) { if (Loc.isInvalid() || Loc.isMacroID()) diff --git a/unittests/Lex/LexerTest.cpp b/unittests/Lex/LexerTest.cpp index 923aff1847..35eee12138 100644 --- a/unittests/Lex/LexerTest.cpp +++ b/unittests/Lex/LexerTest.cpp @@ -420,4 +420,57 @@ TEST_F(LexerTest, DontOverallocateStringifyArgs) { #endif } +TEST_F(LexerTest, IsNewLineEscapedValid) { + auto hasNewLineEscaped = [](const char *S) { + return Lexer::isNewLineEscaped(S, S + strlen(S) - 1); + }; + + EXPECT_TRUE(hasNewLineEscaped("\\\r")); + EXPECT_TRUE(hasNewLineEscaped("\\\n")); + EXPECT_TRUE(hasNewLineEscaped("\\\r\n")); + EXPECT_TRUE(hasNewLineEscaped("\\\n\r")); + EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r")); + EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r\n")); + + EXPECT_FALSE(hasNewLineEscaped("\\\r\r")); + EXPECT_FALSE(hasNewLineEscaped("\\\r\r\n")); + EXPECT_FALSE(hasNewLineEscaped("\\\n\n")); + EXPECT_FALSE(hasNewLineEscaped("\r")); + EXPECT_FALSE(hasNewLineEscaped("\n")); + EXPECT_FALSE(hasNewLineEscaped("\r\n")); + EXPECT_FALSE(hasNewLineEscaped("\n\r")); + EXPECT_FALSE(hasNewLineEscaped("\r\r")); + EXPECT_FALSE(hasNewLineEscaped("\n\n")); +} + +TEST_F(LexerTest, GetBeginningOfTokenWithEscapedNewLine) { + // Each line should have the same length for + // further offset calculation to be more straightforward. + const unsigned IdentifierLength = 8; + std::string TextToLex = "rabarbar\n" + "foo\\\nbar\n" + "foo\\\rbar\n" + "fo\\\r\nbar\n" + "foo\\\n\rba\n"; + std::vector ExpectedTokens{5, tok::identifier}; + std::vector LexedTokens = CheckLex(TextToLex, ExpectedTokens); + + for (const Token &Tok : LexedTokens) { + std::pair OriginalLocation = + SourceMgr.getDecomposedLoc(Tok.getLocation()); + for (unsigned Offset = 0; Offset < IdentifierLength; ++Offset) { + SourceLocation LookupLocation = + Tok.getLocation().getLocWithOffset(Offset); + + std::pair FoundLocation = + SourceMgr.getDecomposedExpansionLoc( + Lexer::GetBeginningOfToken(LookupLocation, SourceMgr, LangOpts)); + + // Check that location returned by the GetBeginningOfToken + // is the same as original token location reported by Lexer. + EXPECT_EQ(FoundLocation.second, OriginalLocation.second); + } + } +} + } // anonymous namespace -- 2.40.0