From: Alexander Kornienko <alexfh@google.com>
Date: Thu, 29 Aug 2013 17:32:57 +0000 (+0000)
Subject: Better support for multiline string literals (including C++11 raw string literals).
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dcc0c5bb7ce9a731ecbc0b8e8477979cd9e730c0;p=clang

Better support for multiline string literals (including C++11 raw string literals).

Summary:
Calculate characters in the first and the last line correctly so that
we only break before the literal when needed.

Reviewers: djasper

Reviewed By: djasper

CC: cfe-commits, klimek

Differential Revision: http://llvm-reviews.chandlerc.com/D1544

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@189595 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp
index f4ae5cce01..718e4a5cde 100644
--- a/lib/Format/ContinuationIndenter.cpp
+++ b/lib/Format/ContinuationIndenter.cpp
@@ -579,6 +579,31 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State,
   return Penalty;
 }
 
+unsigned
+ContinuationIndenter::addMultilineStringLiteral(const FormatToken &Current,
+                                                LineState &State) {
+  StringRef Text = Current.TokenText;
+  // We can only affect layout of the first and the last line, so the penalty
+  // for all other lines is constant, and we ignore it.
+  size_t FirstLineBreak = Text.find('\n');
+  size_t LastLineBreak = Text.find_last_of('\n');
+  assert(FirstLineBreak != StringRef::npos);
+  unsigned StartColumn = State.Column - Current.CodePointCount;
+  State.Column =
+      encoding::getCodePointCount(Text.substr(LastLineBreak + 1), Encoding);
+
+  // Break before further function parameters on all levels.
+  for (unsigned i = 0, e = State.Stack.size(); i != e; ++i)
+    State.Stack[i].BreakBeforeParameter = true;
+
+  unsigned ColumnsUsed =
+      StartColumn +
+      encoding::getCodePointCount(Text.substr(0, FirstLineBreak), Encoding);
+  if (ColumnsUsed > getColumnLimit())
+    return Style.PenaltyExcessCharacter * (ColumnsUsed - getColumnLimit());
+  return 0;
+}
+
 unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
                                                     LineState &State,
                                                     bool DryRun) {
@@ -587,19 +612,18 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
 
   llvm::OwningPtr<BreakableToken> Token;
   unsigned StartColumn = State.Column - Current.CodePointCount;
-  unsigned OriginalStartColumn =
-      SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) - 1;
 
   if (Current.is(tok::string_literal) &&
       Current.Type != TT_ImplicitStringLiteral) {
+    // Don't break string literals with (in case of non-raw strings, escaped)
+    // newlines. As clang-format must not change the string's content, it is
+    // unlikely that we'll end up with a better format.
+    if (Current.IsMultiline)
+      return addMultilineStringLiteral(Current, State);
+
     // Only break up default narrow strings.
     if (!Current.TokenText.startswith("\""))
       return 0;
-    // Don't break string literals with escaped newlines. As clang-format must
-    // not change the string's content, it is unlikely that we'll end up with
-    // a better format.
-    if (Current.TokenText.find("\\\n") != StringRef::npos)
-      return 0;
     // Exempts unterminated string literals from line breaking. The user will
     // likely want to terminate the string before any line breaking is done.
     if (Current.IsUnterminatedLiteral)
@@ -608,6 +632,9 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
     Token.reset(new BreakableStringLiteral(Current, StartColumn,
                                            Line.InPPDirective, Encoding));
   } else if (Current.Type == TT_BlockComment && Current.isTrailingComment()) {
+    unsigned OriginalStartColumn =
+        SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
+        1;
     Token.reset(new BreakableBlockComment(
         Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
         Line.InPPDirective, Encoding));
@@ -621,8 +648,9 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
     // FIXME: If we want to handle them correctly, we'll need to adjust
     // leading whitespace in consecutive lines when changing indentation of
     // the first line similar to what we do with block comments.
-    StringRef::size_type EscapedNewlinePos = Current.TokenText.find("\\\n");
-    if (EscapedNewlinePos != StringRef::npos) {
+    if (Current.IsMultiline) {
+      StringRef::size_type EscapedNewlinePos = Current.TokenText.find("\\\n");
+      assert(EscapedNewlinePos != StringRef::npos);
       State.Column =
           StartColumn +
           encoding::getCodePointCount(
@@ -707,14 +735,19 @@ bool ContinuationIndenter::NextIsMultilineString(const LineState &State) {
   const FormatToken &Current = *State.NextToken;
   if (!Current.is(tok::string_literal))
     return false;
+  // We never consider raw string literals "multiline" for the purpose of
+  // AlwaysBreakBeforeMultilineStrings implementation.
+  if (Current.TokenText.startswith("R\""))
+    return false;
+  if (Current.IsMultiline)
+    return true;
   if (Current.getNextNonComment() &&
       Current.getNextNonComment()->is(tok::string_literal))
     return true; // Implicit concatenation.
   if (State.Column + Current.CodePointCount + Current.UnbreakableTailLength >
       Style.ColumnLimit)
     return true; // String will be split.
-  // String literal might have escaped newlines.
-  return Current.TokenText.find("\\\n") != StringRef::npos;
+  return false;
 }
 
 } // namespace format
diff --git a/lib/Format/ContinuationIndenter.h b/lib/Format/ContinuationIndenter.h
index 81d14ad0a2..70b87bb2fb 100644
--- a/lib/Format/ContinuationIndenter.h
+++ b/lib/Format/ContinuationIndenter.h
@@ -84,6 +84,14 @@ private:
   unsigned breakProtrudingToken(const FormatToken &Current, LineState &State,
                                 bool DryRun);
 
+  /// \brief Adds a multiline string literal to the \p State.
+  ///
+  /// \returns Extra penalty for the first line of the literal: last line is
+  /// handled in \c addNextStateToQueue, and the penalty for other lines doesn't
+  /// matter, as we don't change them.
+  unsigned addMultilineStringLiteral(const FormatToken &Current,
+                                     LineState &State);
+
   /// \brief Returns \c true if the next token starts a multiline string
   /// literal.
   ///
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index 3c740d9d84..84bf36c7fd 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -596,8 +596,11 @@ private:
     FormatTok->CodePointCount =
         encoding::getCodePointCount(FormatTok->TokenText, Encoding);
 
-    // FIXME: Add the CodePointCount to Column.
+    if (FormatTok->isOneOf(tok::string_literal, tok::comment) &&
+        FormatTok->TokenText.find('\n') != StringRef::npos)
+      FormatTok->IsMultiline = true;
 
+    // FIXME: Add the CodePointCount to Column.
     FormatTok->WhitespaceRange = SourceRange(
         WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
     return FormatTok;
diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h
index 950938395e..6006ec87c0 100644
--- a/lib/Format/FormatToken.h
+++ b/lib/Format/FormatToken.h
@@ -80,14 +80,14 @@ class TokenRole;
 /// whitespace characters preceeding it.
 struct FormatToken {
   FormatToken()
-      : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
-        CodePointCount(0), IsFirst(false), MustBreakBefore(false),
-        IsUnterminatedLiteral(false), BlockKind(BK_Unknown), Type(TT_Unknown),
-        SpacesRequiredBefore(0), CanBreakBefore(false),
-        ClosesTemplateDeclaration(false), ParameterCount(0),
-        PackingKind(PPK_Inconclusive), TotalLength(0), UnbreakableTailLength(0),
-        BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0),
-        FakeRParens(0), LastInChainOfCalls(false),
+      : NewlinesBefore(0), HasUnescapedNewline(false), IsMultiline(false),
+        LastNewlineOffset(0), CodePointCount(0), IsFirst(false),
+        MustBreakBefore(false), IsUnterminatedLiteral(false),
+        BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0),
+        CanBreakBefore(false), ClosesTemplateDeclaration(false),
+        ParameterCount(0), PackingKind(PPK_Inconclusive), TotalLength(0),
+        UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0),
+        LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false),
         PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL),
         Next(NULL) {}
 
@@ -104,6 +104,9 @@ struct FormatToken {
   /// Token.
   bool HasUnescapedNewline;
 
+  /// \brief Whether the token text contains newlines (escaped or not).
+  bool IsMultiline;
+
   /// \brief The range of the whitespace immediately preceeding the \c Token.
   SourceRange WhitespaceRange;
 
diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp
index b634bbdbf6..ce837d42cc 100644
--- a/lib/Format/TokenAnnotator.cpp
+++ b/lib/Format/TokenAnnotator.cpp
@@ -1024,8 +1024,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
     Current->CanBreakBefore =
         Current->MustBreakBefore || canBreakBefore(Line, *Current);
     if (Current->MustBreakBefore ||
-        (Current->is(tok::string_literal) &&
-         Current->TokenText.find("\\\n") != StringRef::npos))
+        (Current->is(tok::string_literal) && Current->IsMultiline))
       Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit;
     else
       Current->TotalLength = Current->Previous->TotalLength +
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp
index b20007d1e0..c5e0bcdb55 100644
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -5340,13 +5340,53 @@ TEST_F(FormatTest, BreakStringLiterals) {
 }
 
 TEST_F(FormatTest, DontSplitStringLiteralsWithEscapedNewlines) {
-  EXPECT_EQ("aaaaaaaaaaa =\n"
-            "    \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-            "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-            "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";",
-            format("aaaaaaaaaaa = \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-                   "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-                   "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";"));
+  EXPECT_EQ(
+      "aaaaaaaaaaa = \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
+      "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
+      "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";",
+      format("aaaaaaaaaaa  =  \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
+             "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
+             "  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";"));
+}
+
+TEST_F(FormatTest, CountsCharactersInMultilineRawStringLiterals) {
+  EXPECT_EQ("f(g(R\"x(raw literal)x\", a), b);",
+            format("f(g(R\"x(raw literal)x\", a), b);", getGoogleStyle()));
+  EXPECT_EQ("fffffffffff(g(R\"x(\n"
+            "multiline raw string literal xxxxxxxxxxxxxx\n"
+            ")x\",\n"
+            "              a),\n"
+            "            b);",
+            format("fffffffffff(g(R\"x(\n"
+                   "multiline raw string literal xxxxxxxxxxxxxx\n"
+                   ")x\", a), b);",
+                   getGoogleStyleWithColumns(20)));
+  EXPECT_EQ("fffffffffff(\n"
+            "    g(R\"x(qqq\n"
+            "multiline raw string literal xxxxxxxxxxxxxx\n"
+            ")x\",\n"
+            "      a),\n"
+            "    b);",
+            format("fffffffffff(g(R\"x(qqq\n"
+                   "multiline raw string literal xxxxxxxxxxxxxx\n"
+                   ")x\", a), b);",
+                   getGoogleStyleWithColumns(20)));
+
+  EXPECT_EQ("fffffffffff(R\"x(\n"
+            "multiline raw string literal xxxxxxxxxxxxxx\n"
+            ")x\");",
+            format("fffffffffff(R\"x(\n"
+                   "multiline raw string literal xxxxxxxxxxxxxx\n"
+                   ")x\");",
+                   getGoogleStyleWithColumns(20)));
+  EXPECT_EQ("fffffffffff(R\"x(\n"
+            "multiline raw string literal xxxxxxxxxxxxxx\n"
+            ")x\" +\n"
+            "            bbbbbb);",
+            format("fffffffffff(R\"x(\n"
+                   "multiline raw string literal xxxxxxxxxxxxxx\n"
+                   ")x\" + bbbbbb);",
+                   getGoogleStyleWithColumns(20)));
 }
 
 TEST_F(FormatTest, SkipsUnknownStringLiterals) {