From 83a7dcdf5fce1bdf74ce985419d77a41a51abfa2 Mon Sep 17 00:00:00 2001 From: Alexander Kornienko Date: Tue, 10 Sep 2013 09:38:25 +0000 Subject: [PATCH] Calculate and store ColumnWidth instead of CodePointCount in FormatTokens. Summary: This fixes various issues with mixed tabs and spaces handling, e.g. when realigning block comments. Reviewers: klimek, djasper Reviewed By: djasper CC: cfe-commits Differential Revision: http://llvm-reviews.chandlerc.com/D1608 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@190395 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Format/BreakableToken.cpp | 6 ++- lib/Format/ContinuationIndenter.cpp | 42 ++++++++---------- lib/Format/Format.cpp | 45 +++++++++++-------- lib/Format/FormatToken.cpp | 4 +- lib/Format/FormatToken.h | 27 ++++------- lib/Format/TokenAnnotator.cpp | 13 +++--- unittests/Format/FormatTest.cpp | 69 ++++++++++++++++++----------- 7 files changed, 109 insertions(+), 97 deletions(-) diff --git a/lib/Format/BreakableToken.cpp b/lib/Format/BreakableToken.cpp index 053be4b025..566769afea 100644 --- a/lib/Format/BreakableToken.cpp +++ b/lib/Format/BreakableToken.cpp @@ -337,8 +337,10 @@ void BreakableBlockComment::adjustWhitespace(unsigned LineIndex, // if leading tabs are intermixed with spaces, that is not a high priority. // Adjust the start column uniformly accross all lines. - StartOfLineColumn[LineIndex] = - std::max(0, Whitespace.size() + IndentDelta); + StartOfLineColumn[LineIndex] = std::max( + 0, + encoding::columnWidthWithTabs(Whitespace, 0, Style.TabWidth, Encoding) + + IndentDelta); } unsigned BreakableBlockComment::getLineCount() const { return Lines.size(); } diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp index 40d9d2f7d2..0b380041df 100644 --- a/lib/Format/ContinuationIndenter.cpp +++ b/lib/Format/ContinuationIndenter.cpp @@ -201,7 +201,7 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline, State.NextToken->WhitespaceRange.getEnd()) - SourceMgr.getSpellingColumnNumber( State.NextToken->WhitespaceRange.getBegin()); - State.Column += WhitespaceLength + State.NextToken->CodePointCount; + State.Column += WhitespaceLength + State.NextToken->ColumnWidth; State.NextToken = State.NextToken->Next; return 0; } @@ -257,11 +257,11 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline, State.Line->StartsDefinition))) { State.Column = State.Stack.back().Indent; } else if (Current.Type == TT_ObjCSelectorName) { - if (State.Stack.back().ColonPos > Current.CodePointCount) { - State.Column = State.Stack.back().ColonPos - Current.CodePointCount; + if (State.Stack.back().ColonPos > Current.ColumnWidth) { + State.Column = State.Stack.back().ColonPos - Current.ColumnWidth; } else { State.Column = State.Stack.back().Indent; - State.Stack.back().ColonPos = State.Column + Current.CodePointCount; + State.Stack.back().ColonPos = State.Column + Current.ColumnWidth; } } else if (Current.is(tok::l_square) && Current.Type != TT_ObjCMethodExpr && Current.Type != TT_LambdaLSquare) { @@ -307,7 +307,7 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline, if (!Current.isTrailingComment()) State.Stack.back().LastSpace = State.Column; if (Current.isMemberAccess()) - State.Stack.back().LastSpace += Current.CodePointCount; + State.Stack.back().LastSpace += Current.ColumnWidth; State.StartOfLineLevel = State.ParenLevel; State.LowestLevelOnLine = State.ParenLevel; @@ -343,8 +343,8 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline, State.Stack.back().VariablePos = State.Column; // Move over * and & if they are bound to the variable name. const FormatToken *Tok = &Previous; - while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) { - State.Stack.back().VariablePos -= Tok->CodePointCount; + while (Tok && State.Stack.back().VariablePos >= Tok->ColumnWidth) { + State.Stack.back().VariablePos -= Tok->ColumnWidth; if (Tok->SpacesRequiredBefore != 0) break; Tok = Tok->Previous; @@ -361,12 +361,12 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline, if (Current.Type == TT_ObjCSelectorName && State.Stack.back().ColonPos == 0) { if (State.Stack.back().Indent + Current.LongestObjCSelectorName > - State.Column + Spaces + Current.CodePointCount) + State.Column + Spaces + Current.ColumnWidth) State.Stack.back().ColonPos = State.Stack.back().Indent + Current.LongestObjCSelectorName; else State.Stack.back().ColonPos = - State.Column + Spaces + Current.CodePointCount; + State.Column + Spaces + Current.ColumnWidth; } if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr && @@ -436,7 +436,7 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State, std::min(State.LowestLevelOnLine, State.ParenLevel); if (Current.isMemberAccess()) State.Stack.back().StartOfFunctionCall = - Current.LastInChainOfCalls ? 0 : State.Column + Current.CodePointCount; + Current.LastInChainOfCalls ? 0 : State.Column + Current.ColumnWidth; if (Current.Type == TT_CtorInitializerColon) { // Indent 2 from the column, so: // SomeClass::SomeClass() @@ -592,7 +592,7 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State, State.StartOfStringLiteral = 0; } - State.Column += Current.CodePointCount; + State.Column += Current.ColumnWidth; State.NextToken = State.NextToken->Next; unsigned Penalty = breakProtrudingToken(Current, State, DryRun); if (State.Column > getColumnLimit(State)) { @@ -618,8 +618,7 @@ ContinuationIndenter::addMultilineStringLiteral(const FormatToken &Current, for (unsigned i = 0, e = State.Stack.size(); i != e; ++i) State.Stack[i].BreakBeforeParameter = true; - unsigned ColumnsUsed = - State.Column - Current.CodePointCount + Current.FirstLineColumnWidth; + unsigned ColumnsUsed = State.Column; // We can only affect layout of the first and the last line, so the penalty // for all other lines is constant, and we ignore it. State.Column = Current.LastLineColumnWidth; @@ -636,14 +635,14 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current, return 0; llvm::OwningPtr Token; - unsigned StartColumn = State.Column - Current.CodePointCount; + unsigned StartColumn = State.Column - Current.ColumnWidth; if (Current.is(tok::string_literal) && Current.Type != TT_ImplicitStringLiteral) { // Don't break string literals with (in case of non-raw strings, escaped) // newlines. As clang-format must not change the string's content, it is // unlikely that we'll end up with a better format. - if (Current.isMultiline()) + if (Current.IsMultiline) return addMultilineStringLiteral(Current, State); // Only break up default narrow strings. @@ -657,11 +656,8 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current, Token.reset(new BreakableStringLiteral( Current, StartColumn, State.Line->InPPDirective, Encoding, Style)); } else if (Current.Type == TT_BlockComment && Current.isTrailingComment()) { - unsigned OriginalStartColumn = - SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) - - 1; Token.reset(new BreakableBlockComment( - Current, StartColumn, OriginalStartColumn, !Current.Previous, + Current, StartColumn, Current.OriginalColumn, !Current.Previous, State.Line->InPPDirective, Encoding, Style)); } else if (Current.Type == TT_LineComment && (Current.Previous == NULL || @@ -673,10 +669,8 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current, // FIXME: If we want to handle them correctly, we'll need to adjust // leading whitespace in consecutive lines when changing indentation of // the first line similar to what we do with block comments. - if (Current.isMultiline()) { - State.Column = StartColumn + Current.FirstLineColumnWidth; + if (Current.IsMultiline) return 0; - } Token.reset(new BreakableLineComment( Current, StartColumn, State.Line->InPPDirective, Encoding, Style)); @@ -759,12 +753,12 @@ bool ContinuationIndenter::NextIsMultilineString(const LineState &State) { // AlwaysBreakBeforeMultilineStrings implementation. if (Current.TokenText.startswith("R\"")) return false; - if (Current.isMultiline()) + if (Current.IsMultiline) return true; if (Current.getNextNonComment() && Current.getNextNonComment()->is(tok::string_literal)) return true; // Implicit concatenation. - if (State.Column + Current.CodePointCount + Current.UnbreakableTailLength > + if (State.Column + Current.ColumnWidth + Current.UnbreakableTailLength > Style.ColumnLimit) return true; // String will be split. return false; diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp index fe6660b3bc..3a0802a460 100644 --- a/lib/Format/Format.cpp +++ b/lib/Format/Format.cpp @@ -610,7 +610,7 @@ private: FormatTok->WhitespaceRange = SourceRange(GreaterLocation, GreaterLocation); FormatTok->TokenText = ">"; - FormatTok->CodePointCount = 1; + FormatTok->ColumnWidth = 1; GreaterStashed = false; return FormatTok; } @@ -666,6 +666,10 @@ private: Column = 0; FormatTok->TokenText = FormatTok->TokenText.substr(2); } + + FormatTok->WhitespaceRange = SourceRange( + WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); + FormatTok->OriginalColumn = Column; TrailingWhitespace = 0; @@ -685,24 +689,29 @@ private: } // Now FormatTok is the next non-whitespace token. - FormatTok->CodePointCount = - encoding::getCodePointCount(FormatTok->TokenText, Encoding); - - if (FormatTok->isOneOf(tok::string_literal, tok::comment)) { - StringRef Text = FormatTok->TokenText; - size_t FirstNewlinePos = Text.find('\n'); - if (FirstNewlinePos != StringRef::npos) { - // FIXME: Handle embedded tabs. - FormatTok->FirstLineColumnWidth = encoding::columnWidthWithTabs( - Text.substr(0, FirstNewlinePos), 0, Style.TabWidth, Encoding); - FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs( - Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, - Encoding); - } + + StringRef Text = FormatTok->TokenText; + size_t FirstNewlinePos = Text.find('\n'); + if (FirstNewlinePos != StringRef::npos) { + FormatTok->IsMultiline = true; + // The last line of the token always starts in column 0. + // Thus, the length can be precomputed even in the presence of tabs. + FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs( + Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, + Encoding); + Text = Text.substr(0, FirstNewlinePos); } - // FIXME: Add the CodePointCount to Column. - FormatTok->WhitespaceRange = SourceRange( - WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); + + // FIXME: ColumnWidth actually depends on the start column, we need to + // take this into account when the token is moved. + FormatTok->ColumnWidth = + encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding); + + // FIXME: For multi-line tokens this should be LastLineColumnWidth. + // For line comments this should probably be zero. But before changing, + // we need good tests for this. + Column += FormatTok->ColumnWidth; + return FormatTok; } diff --git a/lib/Format/FormatToken.cpp b/lib/Format/FormatToken.cpp index e6c72ab0de..49e0ce8337 100644 --- a/lib/Format/FormatToken.cpp +++ b/lib/Format/FormatToken.cpp @@ -42,7 +42,7 @@ unsigned CommaSeparatedList::format(LineState &State, // Calculate the number of code points we have to format this list. As the // first token is already placed, we have to subtract it. unsigned RemainingCodePoints = Style.ColumnLimit - State.Column + - State.NextToken->Previous->CodePointCount; + State.NextToken->Previous->ColumnWidth; // Find the best ColumnFormat, i.e. the best number of columns to use. const ColumnFormat *Format = getColumnFormat(RemainingCodePoints); @@ -82,7 +82,7 @@ unsigned CommaSeparatedList::format(LineState &State, static unsigned CodePointsBetween(const FormatToken *Begin, const FormatToken *End) { assert(End->TotalLength >= Begin->TotalLength); - return End->TotalLength - Begin->TotalLength + Begin->CodePointCount; + return End->TotalLength - Begin->TotalLength + Begin->ColumnWidth; } void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h index e374f835d1..c02ac6357a 100644 --- a/lib/Format/FormatToken.h +++ b/lib/Format/FormatToken.h @@ -83,7 +83,7 @@ class AnnotatedLine; struct FormatToken { FormatToken() : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0), - CodePointCount(0), FirstLineColumnWidth(0), LastLineColumnWidth(0), + ColumnWidth(0), LastLineColumnWidth(0), IsMultiline(false), IsFirst(false), MustBreakBefore(false), IsUnterminatedLiteral(false), BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0), CanBreakBefore(false), ClosesTemplateDeclaration(false), @@ -114,22 +114,17 @@ struct FormatToken { /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. unsigned LastNewlineOffset; - /// \brief The length of the non-whitespace parts of the token in CodePoints. + /// \brief The width of the non-whitespace parts of the token (or its first + /// line for multi-line tokens) in columns. /// We need this to correctly measure number of columns a token spans. - unsigned CodePointCount; + unsigned ColumnWidth; - /// \brief Contains the number of code points in the first line of a - /// multi-line string literal or comment. Zero if there's no newline in the + /// \brief Contains the width in columns of the last line of a multi-line /// token. - unsigned FirstLineColumnWidth; - - /// \brief Contains the number of code points in the last line of a - /// multi-line string literal or comment. Can be zero for line comments. unsigned LastLineColumnWidth; - /// \brief Returns \c true if the token text contains newlines (escaped or - /// not). - bool isMultiline() const { return FirstLineColumnWidth != 0; } + /// \brief Whether the token text contains newlines (escaped or not). + bool IsMultiline; /// \brief Indicates that this is the first token. bool IsFirst; @@ -189,12 +184,8 @@ struct FormatToken { /// token. unsigned TotalLength; - /// \brief The original column of this token, including expanded tabs. - /// The configured IndentWidth is used as tab width. Only tabs in whitespace - /// are expanded. - /// FIXME: This is currently only used on the first token of an unwrapped - /// line, and the implementation is not correct for other tokens (see the - /// FIXMEs in FormatTokenLexer::getNextToken()). + /// \brief The original 0-based column of this token, including expanded tabs. + /// The configured TabWidth is used as tab width. unsigned OriginalColumn; /// \brief The length of following tokens until the next natural split point, diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp index e3cd2105e6..c0897ca68c 100644 --- a/lib/Format/TokenAnnotator.cpp +++ b/lib/Format/TokenAnnotator.cpp @@ -326,10 +326,10 @@ private: Line.First->Type == TT_ObjCMethodSpecifier) { Tok->Type = TT_ObjCMethodExpr; Tok->Previous->Type = TT_ObjCSelectorName; - if (Tok->Previous->CodePointCount > + if (Tok->Previous->ColumnWidth > Contexts.back().LongestObjCSelectorName) { Contexts.back().LongestObjCSelectorName = - Tok->Previous->CodePointCount; + Tok->Previous->ColumnWidth; } if (Contexts.back().FirstObjCSelectorName == NULL) Contexts.back().FirstObjCSelectorName = Tok->Previous; @@ -1022,7 +1022,8 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) { } void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { - Line.First->TotalLength = Line.First->CodePointCount; + Line.First->TotalLength = + Line.First->IsMultiline ? Style.ColumnLimit : Line.First->ColumnWidth; if (!Line.First->Next) return; FormatToken *Current = Line.First->Next; @@ -1055,11 +1056,11 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { Current->CanBreakBefore = Current->MustBreakBefore || canBreakBefore(Line, *Current); if (Current->MustBreakBefore || !Current->Children.empty() || - (Current->is(tok::string_literal) && Current->isMultiline())) + Current->IsMultiline) Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit; else Current->TotalLength = Current->Previous->TotalLength + - Current->CodePointCount + + Current->ColumnWidth + Current->SpacesRequiredBefore; // FIXME: Only calculate this if CanBreakBefore is true once static // initializers etc. are sorted out. @@ -1095,7 +1096,7 @@ void TokenAnnotator::calculateUnbreakableTailLengths(AnnotatedLine &Line) { UnbreakableTailLength = 0; } else { UnbreakableTailLength += - Current->CodePointCount + Current->SpacesRequiredBefore; + Current->ColumnWidth + Current->SpacesRequiredBefore; } Current = Current->Previous; } diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp index ef7d63c1eb..8ae956e4fd 100644 --- a/unittests/Format/FormatTest.cpp +++ b/unittests/Format/FormatTest.cpp @@ -5738,33 +5738,48 @@ TEST_F(FormatTest, ConfigurableUseOfTab) { // FIXME: To correctly count mixed whitespace we need to // also correctly count mixed whitespace in front of the comment. - // Tab.TabWidth = 8; - // Tab.IndentWidth = 8; - // EXPECT_EQ("/*\n" - // "\t a\t\tcomment\n" - // "\t in multiple lines\n" - // " */", - // format(" /*\t \t \n" - // " \t \t a\t\tcomment\t \t\n" - // " \t \t in multiple lines\t\n" - // " \t */", - // Tab)); - // Tab.UseTab = false; - // EXPECT_EQ("/*\n" - // " a\t\tcomment\n" - // " in multiple lines\n" - // " */", - // format(" /*\t \t \n" - // " \t \t a\t\tcomment\t \t\n" - // " \t \t in multiple lines\t\n" - // " \t */", - // Tab)); - // EXPECT_EQ("/* some\n" - // " comment */", - // format(" \t \t /* some\n" - // " \t \t comment */", - // Tab)); - + Tab.TabWidth = 8; + Tab.IndentWidth = 8; + EXPECT_EQ("/*\n" + "\t a\t\tcomment\n" + "\t in multiple lines\n" + " */", + format(" /*\t \t \n" + " \t \t a\t\tcomment\t \t\n" + " \t \t in multiple lines\t\n" + " \t */", + Tab)); + Tab.UseTab = false; + EXPECT_EQ("/*\n" + " a\t\tcomment\n" + " in multiple lines\n" + " */", + format(" /*\t \t \n" + " \t \t a\t\tcomment\t \t\n" + " \t \t in multiple lines\t\n" + " \t */", + Tab)); + EXPECT_EQ("/* some\n" + " comment */", + format(" \t \t /* some\n" + " \t \t comment */", + Tab)); + EXPECT_EQ("int a; /* some\n" + " comment */", + format(" \t \t int a; /* some\n" + " \t \t comment */", + Tab)); + + EXPECT_EQ("int a; /* some\n" + "comment */", + format(" \t \t int\ta; /* some\n" + " \t \t comment */", + Tab)); + EXPECT_EQ("f(\"\t\t\"); /* some\n" + " comment */", + format(" \t \t f(\"\t\t\"); /* some\n" + " \t \t comment */", + Tab)); EXPECT_EQ("{\n" " /*\n" " * Comment\n" -- 2.40.0