[Lexer] Finding beginning of token with escaped new line

author Alexander Kornienko <alexfh@google.com>

Thu, 10 Aug 2017 10:06:16 +0000 (10:06 +0000)

committer Alexander Kornienko <alexfh@google.com>

Thu, 10 Aug 2017 10:06:16 +0000 (10:06 +0000)
author Alexander Kornienko <alexfh@google.com>
Thu, 10 Aug 2017 10:06:16 +0000 (10:06 +0000)
committer Alexander Kornienko <alexfh@google.com>
Thu, 10 Aug 2017 10:06:16 +0000 (10:06 +0000)
diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h

index 3be733167e5cfd230bbf2845f5b4ba48172d2a85..aa8bf3891ed966a9dac2fc66be88f0c2f43f8180 100644 (file)
--- a/include/clang/Lex/Lexer.h
+++ b/include/clang/Lex/Lexer.h
@@ -463,6 +463,10 @@ public:
    /// \brief Returns true if the given character could appear in an identifier.
    static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
  
+  /// \brief Checks whether the new line pointed to by Str is escaped, i.e.
+  /// preceded by a backslash optionally followed by horizontal whitespace.
+  static bool isNewLineEscaped(const char *BufferStart, const char *Str);
+
    /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
    /// emit a warning.
    static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp

index 61bcef8cb760e52c9be65b05f70b44700d432b56..79472961c010e64c94d2a5b5d4d9e0b2d2d64234 100644 (file)
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -463,19 +463,15 @@ static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
    const char *BufStart = Buffer.data();
    if (Offset >= Buffer.size())
      return nullptr;
-  const char *StrData = BufStart + Offset;
  
-  if (StrData[0] == '\n' || StrData[0] == '\r')
-    return StrData;
-
-  const char *LexStart = StrData;
-  while (LexStart != BufStart) {
-    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
+  const char *LexStart = BufStart + Offset;
+  for (; LexStart != BufStart; --LexStart) {
+    if (isVerticalWhitespace(LexStart[0]) &&
+        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
+      // LexStart should point at the first character of the logical line.
        ++LexStart;
        break;
      }
-
-    --LexStart;
    }
    return LexStart;
  }
@@ -487,7 +483,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
    std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
    if (LocInfo.first.isInvalid())
      return Loc;
-  
+
    bool Invalid = false;
    StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
    if (Invalid)
@@ -499,31 +495,31 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
    const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
    if (!LexStart || LexStart == StrData)
      return Loc;
-  
+
    // Create a lexer starting at the beginning of this token.
    SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
    Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                   Buffer.end());
    TheLexer.SetCommentRetentionState(true);
-  
+
    // Lex tokens until we find the token that contains the source location.
    Token TheTok;
    do {
      TheLexer.LexFromRawLexer(TheTok);
-    
+
      if (TheLexer.getBufferLocation() > StrData) {
        // Lexing this token has taken the lexer past the source location we're
        // looking for. If the current token encompasses our source location,
        // return the beginning of that token.
        if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
          return TheTok.getLocation();
-      
+
        // We ended up skipping over the source location entirely, which means
        // that it points into whitespace. We're done here.
        break;
      }
    } while (TheTok.getKind() != tok::eof);
-  
+
    // We've passed our source location; just return the original source location.
    return Loc;
  }
@@ -531,20 +527,20 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
  SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                            const SourceManager &SM,
                                            const LangOptions &LangOpts) {
- if (Loc.isFileID())
-   return getBeginningOfFileToken(Loc, SM, LangOpts);
- 
- if (!SM.isMacroArgExpansion(Loc))
-   return Loc;
+  if (Loc.isFileID())
+    return getBeginningOfFileToken(Loc, SM, LangOpts);
+
+  if (!SM.isMacroArgExpansion(Loc))
+    return Loc;
  
- SourceLocation FileLoc = SM.getSpellingLoc(Loc);
- SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
- std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
- std::pair<FileID, unsigned> BeginFileLocInfo
-   = SM.getDecomposedLoc(BeginFileLoc);
- assert(FileLocInfo.first == BeginFileLocInfo.first &&
-        FileLocInfo.second >= BeginFileLocInfo.second);
- return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
+  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
+  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
+  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
+  std::pair<FileID, unsigned> BeginFileLocInfo =
+      SM.getDecomposedLoc(BeginFileLoc);
+  assert(FileLocInfo.first == BeginFileLocInfo.first &&
+         FileLocInfo.second >= BeginFileLocInfo.second);
+  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
  }
  
  namespace {
@@ -1032,6 +1028,26 @@ bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
    return isIdentifierBody(c, LangOpts.DollarIdents);
  }
  
+bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
+  assert(isVerticalWhitespace(Str[0]));
+  if (Str - 1 < BufferStart)
+    return false;
+
+  if ((Str[0] == '\n' && Str[-1] == '\r') ||
+      (Str[0] == '\r' && Str[-1] == '\n')) {
+    if (Str - 2 < BufferStart)
+      return false;
+    --Str;
+  }
+  --Str;
+
+  // Rewind to first non-space character:
+  while (Str > BufferStart && isHorizontalWhitespace(*Str))
+    --Str;
+
+  return *Str == '\\';
+}
+
  StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                         const SourceManager &SM) {
    if (Loc.isInvalid() || Loc.isMacroID())
diff --git a/unittests/Lex/LexerTest.cpp b/unittests/Lex/LexerTest.cpp

index 923aff18472b6075499cae929f2a175c7e6c27e6..35eee121384818476f9573b2f722541e5823497c 100644 (file)
--- a/unittests/Lex/LexerTest.cpp
+++ b/unittests/Lex/LexerTest.cpp
@@ -420,4 +420,57 @@ TEST_F(LexerTest, DontOverallocateStringifyArgs) {
  #endif
  }
  
+TEST_F(LexerTest, IsNewLineEscapedValid) {
+  auto hasNewLineEscaped = [](const char *S) {
+    return Lexer::isNewLineEscaped(S, S + strlen(S) - 1);
+  };
+
+  EXPECT_TRUE(hasNewLineEscaped("\\\r"));
+  EXPECT_TRUE(hasNewLineEscaped("\\\n"));
+  EXPECT_TRUE(hasNewLineEscaped("\\\r\n"));
+  EXPECT_TRUE(hasNewLineEscaped("\\\n\r"));
+  EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r"));
+  EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r\n"));
+
+  EXPECT_FALSE(hasNewLineEscaped("\\\r\r"));
+  EXPECT_FALSE(hasNewLineEscaped("\\\r\r\n"));
+  EXPECT_FALSE(hasNewLineEscaped("\\\n\n"));
+  EXPECT_FALSE(hasNewLineEscaped("\r"));
+  EXPECT_FALSE(hasNewLineEscaped("\n"));
+  EXPECT_FALSE(hasNewLineEscaped("\r\n"));
+  EXPECT_FALSE(hasNewLineEscaped("\n\r"));
+  EXPECT_FALSE(hasNewLineEscaped("\r\r"));
+  EXPECT_FALSE(hasNewLineEscaped("\n\n"));
+}
+
+TEST_F(LexerTest, GetBeginningOfTokenWithEscapedNewLine) {
+  // Each line has the same length so that the
+  // offset calculations below stay straightforward.
+  const unsigned IdentifierLength = 8;
+  std::string TextToLex = "rabarbar\n"
+                          "foo\\\nbar\n"
+                          "foo\\\rbar\n"
+                          "fo\\\r\nbar\n"
+                          "foo\\\n\rba\n";
+  std::vector<tok::TokenKind> ExpectedTokens{5, tok::identifier};
+  std::vector<Token> LexedTokens = CheckLex(TextToLex, ExpectedTokens);
+
+  for (const Token &Tok : LexedTokens) {
+    std::pair<FileID, unsigned> OriginalLocation =
+        SourceMgr.getDecomposedLoc(Tok.getLocation());
+    for (unsigned Offset = 0; Offset < IdentifierLength; ++Offset) {
+      SourceLocation LookupLocation =
+          Tok.getLocation().getLocWithOffset(Offset);
+
+      std::pair<FileID, unsigned> FoundLocation =
+          SourceMgr.getDecomposedExpansionLoc(
+              Lexer::GetBeginningOfToken(LookupLocation, SourceMgr, LangOpts));
+
+      // Check that the location returned by GetBeginningOfToken
+      // is the same as the original token location reported by the Lexer.
+      EXPECT_EQ(FoundLocation.second, OriginalLocation.second);
+    }
+  }
+}
+
  } // anonymous namespace
author	Alexander Kornienko <alexfh@google.com>
	Thu, 10 Aug 2017 10:06:16 +0000 (10:06 +0000)
committer	Alexander Kornienko <alexfh@google.com>
	Thu, 10 Aug 2017 10:06:16 +0000 (10:06 +0000)
include/clang/Lex/Lexer.h		patch \| blob \| history
lib/Lex/Lexer.cpp		patch \| blob \| history
unittests/Lex/LexerTest.cpp		patch \| blob \| history