From: Alexander Kornienko Date: Thu, 29 Aug 2013 17:32:57 +0000 (+0000) Subject: Better support for multiline string literals (including C++11 raw string literals). X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dcc0c5bb7ce9a731ecbc0b8e8477979cd9e730c0;p=clang Better support for multiline string literals (including C++11 raw string literals). Summary: Calculate characters in the first and the last line correctly so that we only break before the literal when needed. Reviewers: djasper Reviewed By: djasper CC: cfe-commits, klimek Differential Revision: http://llvm-reviews.chandlerc.com/D1544 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@189595 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp index f4ae5cce01..718e4a5cde 100644 --- a/lib/Format/ContinuationIndenter.cpp +++ b/lib/Format/ContinuationIndenter.cpp @@ -579,6 +579,31 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State, return Penalty; } +unsigned +ContinuationIndenter::addMultilineStringLiteral(const FormatToken &Current, + LineState &State) { + StringRef Text = Current.TokenText; + // We can only affect layout of the first and the last line, so the penalty + // for all other lines is constant, and we ignore it. + size_t FirstLineBreak = Text.find('\n'); + size_t LastLineBreak = Text.find_last_of('\n'); + assert(FirstLineBreak != StringRef::npos); + unsigned StartColumn = State.Column - Current.CodePointCount; + State.Column = + encoding::getCodePointCount(Text.substr(LastLineBreak + 1), Encoding); + + // Break before further function parameters on all levels. + for (unsigned i = 0, e = State.Stack.size(); i != e; ++i) + State.Stack[i].BreakBeforeParameter = true; + + unsigned ColumnsUsed = + StartColumn + + encoding::getCodePointCount(Text.substr(0, FirstLineBreak), Encoding); + if (ColumnsUsed > getColumnLimit()) + return Style.PenaltyExcessCharacter * (ColumnsUsed - getColumnLimit()); + return 0; +} + unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current, LineState &State, bool DryRun) { @@ -587,19 +612,18 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current, llvm::OwningPtr Token; unsigned StartColumn = State.Column - Current.CodePointCount; - unsigned OriginalStartColumn = - SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) - 1; if (Current.is(tok::string_literal) && Current.Type != TT_ImplicitStringLiteral) { + // Don't break string literals with (in case of non-raw strings, escaped) + // newlines. As clang-format must not change the string's content, it is + // unlikely that we'll end up with a better format. + if (Current.IsMultiline) + return addMultilineStringLiteral(Current, State); + // Only break up default narrow strings. if (!Current.TokenText.startswith("\"")) return 0; - // Don't break string literals with escaped newlines. As clang-format must - // not change the string's content, it is unlikely that we'll end up with - // a better format. - if (Current.TokenText.find("\\\n") != StringRef::npos) - return 0; // Exempts unterminated string literals from line breaking. The user will // likely want to terminate the string before any line breaking is done. if (Current.IsUnterminatedLiteral) @@ -608,6 +632,9 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current, Token.reset(new BreakableStringLiteral(Current, StartColumn, Line.InPPDirective, Encoding)); } else if (Current.Type == TT_BlockComment && Current.isTrailingComment()) { + unsigned OriginalStartColumn = + SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) - + 1; Token.reset(new BreakableBlockComment( Style, Current, StartColumn, OriginalStartColumn, !Current.Previous, Line.InPPDirective, Encoding)); @@ -621,8 +648,9 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current, // FIXME: If we want to handle them correctly, we'll need to adjust // leading whitespace in consecutive lines when changing indentation of // the first line similar to what we do with block comments. - StringRef::size_type EscapedNewlinePos = Current.TokenText.find("\\\n"); - if (EscapedNewlinePos != StringRef::npos) { + if (Current.IsMultiline) { + StringRef::size_type EscapedNewlinePos = Current.TokenText.find("\\\n"); + assert(EscapedNewlinePos != StringRef::npos); State.Column = StartColumn + encoding::getCodePointCount( @@ -707,14 +735,19 @@ bool ContinuationIndenter::NextIsMultilineString(const LineState &State) { const FormatToken &Current = *State.NextToken; if (!Current.is(tok::string_literal)) return false; + // We never consider raw string literals "multiline" for the purpose of + // AlwaysBreakBeforeMultilineStrings implementation. + if (Current.TokenText.startswith("R\"")) + return false; + if (Current.IsMultiline) + return true; if (Current.getNextNonComment() && Current.getNextNonComment()->is(tok::string_literal)) return true; // Implicit concatenation. if (State.Column + Current.CodePointCount + Current.UnbreakableTailLength > Style.ColumnLimit) return true; // String will be split. - // String literal might have escaped newlines. - return Current.TokenText.find("\\\n") != StringRef::npos; + return false; } } // namespace format diff --git a/lib/Format/ContinuationIndenter.h b/lib/Format/ContinuationIndenter.h index 81d14ad0a2..70b87bb2fb 100644 --- a/lib/Format/ContinuationIndenter.h +++ b/lib/Format/ContinuationIndenter.h @@ -84,6 +84,14 @@ private: unsigned breakProtrudingToken(const FormatToken &Current, LineState &State, bool DryRun); + /// \brief Adds a multiline string literal to the \p State. + /// + /// \returns Extra penalty for the first line of the literal: last line is + /// handled in \c addNextStateToQueue, and the penalty for other lines doesn't + /// matter, as we don't change them. + unsigned addMultilineStringLiteral(const FormatToken &Current, + LineState &State); + /// \brief Returns \c true if the next token starts a multiline string /// literal. /// diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp index 3c740d9d84..84bf36c7fd 100644 --- a/lib/Format/Format.cpp +++ b/lib/Format/Format.cpp @@ -596,8 +596,11 @@ private: FormatTok->CodePointCount = encoding::getCodePointCount(FormatTok->TokenText, Encoding); - // FIXME: Add the CodePointCount to Column. + if (FormatTok->isOneOf(tok::string_literal, tok::comment) && + FormatTok->TokenText.find('\n') != StringRef::npos) + FormatTok->IsMultiline = true; + // FIXME: Add the CodePointCount to Column. FormatTok->WhitespaceRange = SourceRange( WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); return FormatTok; diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h index 950938395e..6006ec87c0 100644 --- a/lib/Format/FormatToken.h +++ b/lib/Format/FormatToken.h @@ -80,14 +80,14 @@ class TokenRole; /// whitespace characters preceeding it. struct FormatToken { FormatToken() - : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0), - CodePointCount(0), IsFirst(false), MustBreakBefore(false), - IsUnterminatedLiteral(false), BlockKind(BK_Unknown), Type(TT_Unknown), - SpacesRequiredBefore(0), CanBreakBefore(false), - ClosesTemplateDeclaration(false), ParameterCount(0), - PackingKind(PPK_Inconclusive), TotalLength(0), UnbreakableTailLength(0), - BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0), - FakeRParens(0), LastInChainOfCalls(false), + : NewlinesBefore(0), HasUnescapedNewline(false), IsMultiline(false), + LastNewlineOffset(0), CodePointCount(0), IsFirst(false), + MustBreakBefore(false), IsUnterminatedLiteral(false), + BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0), + CanBreakBefore(false), ClosesTemplateDeclaration(false), + ParameterCount(0), PackingKind(PPK_Inconclusive), TotalLength(0), + UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0), + LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false), PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL), Next(NULL) {} @@ -104,6 +104,9 @@ struct FormatToken { /// Token. bool HasUnescapedNewline; + /// \brief Whether the token text contains newlines (escaped or not). + bool IsMultiline; + /// \brief The range of the whitespace immediately preceeding the \c Token. SourceRange WhitespaceRange; diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp index b634bbdbf6..ce837d42cc 100644 --- a/lib/Format/TokenAnnotator.cpp +++ b/lib/Format/TokenAnnotator.cpp @@ -1024,8 +1024,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { Current->CanBreakBefore = Current->MustBreakBefore || canBreakBefore(Line, *Current); if (Current->MustBreakBefore || - (Current->is(tok::string_literal) && - Current->TokenText.find("\\\n") != StringRef::npos)) + (Current->is(tok::string_literal) && Current->IsMultiline)) Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit; else Current->TotalLength = Current->Previous->TotalLength + diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp index b20007d1e0..c5e0bcdb55 100644 --- a/unittests/Format/FormatTest.cpp +++ b/unittests/Format/FormatTest.cpp @@ -5340,13 +5340,53 @@ TEST_F(FormatTest, BreakStringLiterals) { } TEST_F(FormatTest, DontSplitStringLiteralsWithEscapedNewlines) { - EXPECT_EQ("aaaaaaaaaaa =\n" - " \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";", - format("aaaaaaaaaaa = \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";")); + EXPECT_EQ( + "aaaaaaaaaaa = \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";", + format("aaaaaaaaaaa = \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\";")); +} + +TEST_F(FormatTest, CountsCharactersInMultilineRawStringLiterals) { + EXPECT_EQ("f(g(R\"x(raw literal)x\", a), b);", + format("f(g(R\"x(raw literal)x\", a), b);", getGoogleStyle())); + EXPECT_EQ("fffffffffff(g(R\"x(\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\",\n" + " a),\n" + " b);", + format("fffffffffff(g(R\"x(\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\", a), b);", + getGoogleStyleWithColumns(20))); + EXPECT_EQ("fffffffffff(\n" + " g(R\"x(qqq\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\",\n" + " a),\n" + " b);", + format("fffffffffff(g(R\"x(qqq\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\", a), b);", + getGoogleStyleWithColumns(20))); + + EXPECT_EQ("fffffffffff(R\"x(\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\");", + format("fffffffffff(R\"x(\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\");", + getGoogleStyleWithColumns(20))); + EXPECT_EQ("fffffffffff(R\"x(\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\" +\n" + " bbbbbb);", + format("fffffffffff(R\"x(\n" + "multiline raw string literal xxxxxxxxxxxxxx\n" + ")x\" + bbbbbb);", + getGoogleStyleWithColumns(20))); } TEST_F(FormatTest, SkipsUnknownStringLiterals) {