From 83a7dcdf5fce1bdf74ce985419d77a41a51abfa2 Mon Sep 17 00:00:00 2001
From: Alexander Kornienko <alexfh@google.com>
Date: Tue, 10 Sep 2013 09:38:25 +0000
Subject: [PATCH] Calculate and store ColumnWidth instead of CodePointCount in
 FormatTokens.

Summary:
This fixes various issues with mixed tabs and spaces handling, e.g.
when realigning block comments.

Reviewers: klimek, djasper

Reviewed By: djasper

CC: cfe-commits

Differential Revision: http://llvm-reviews.chandlerc.com/D1608

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@190395 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Format/BreakableToken.cpp       |  6 ++-
 lib/Format/ContinuationIndenter.cpp | 42 ++++++++----------
 lib/Format/Format.cpp               | 45 +++++++++++--------
 lib/Format/FormatToken.cpp          |  4 +-
 lib/Format/FormatToken.h            | 27 ++++-------
 lib/Format/TokenAnnotator.cpp       | 13 +++---
 unittests/Format/FormatTest.cpp     | 69 ++++++++++++++++++-----------
 7 files changed, 109 insertions(+), 97 deletions(-)
diff --git a/lib/Format/BreakableToken.cpp b/lib/Format/BreakableToken.cpp
index 053be4b025..566769afea 100644
--- a/lib/Format/BreakableToken.cpp
+++ b/lib/Format/BreakableToken.cpp
@@ -337,8 +337,10 @@ void BreakableBlockComment::adjustWhitespace(unsigned LineIndex,
   // if leading tabs are intermixed with spaces, that is not a high priority.
 
   // Adjust the start column uniformly accross all lines.
-  StartOfLineColumn[LineIndex] =
-      std::max<int>(0, Whitespace.size() + IndentDelta);
+  StartOfLineColumn[LineIndex] = std::max<int>(
+      0,
+      encoding::columnWidthWithTabs(Whitespace, 0, Style.TabWidth, Encoding) +
+          IndentDelta);
 }
 
 unsigned BreakableBlockComment::getLineCount() const { return Lines.size(); }
diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp
index 40d9d2f7d2..0b380041df 100644
--- a/lib/Format/ContinuationIndenter.cpp
+++ b/lib/Format/ContinuationIndenter.cpp
@@ -201,7 +201,7 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline,
                                State.NextToken->WhitespaceRange.getEnd()) -
                            SourceMgr.getSpellingColumnNumber(
                                State.NextToken->WhitespaceRange.getBegin());
-    State.Column += WhitespaceLength + State.NextToken->CodePointCount;
+    State.Column += WhitespaceLength + State.NextToken->ColumnWidth;
     State.NextToken = State.NextToken->Next;
     return 0;
   }
@@ -257,11 +257,11 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline,
                  State.Line->StartsDefinition))) {
       State.Column = State.Stack.back().Indent;
     } else if (Current.Type == TT_ObjCSelectorName) {
-      if (State.Stack.back().ColonPos > Current.CodePointCount) {
-        State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
+      if (State.Stack.back().ColonPos > Current.ColumnWidth) {
+        State.Column = State.Stack.back().ColonPos - Current.ColumnWidth;
       } else {
         State.Column = State.Stack.back().Indent;
-        State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
+        State.Stack.back().ColonPos = State.Column + Current.ColumnWidth;
       }
     } else if (Current.is(tok::l_square) && Current.Type != TT_ObjCMethodExpr &&
                Current.Type != TT_LambdaLSquare) {
@@ -307,7 +307,7 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline,
     if (!Current.isTrailingComment())
       State.Stack.back().LastSpace = State.Column;
     if (Current.isMemberAccess())
-      State.Stack.back().LastSpace += Current.CodePointCount;
+      State.Stack.back().LastSpace += Current.ColumnWidth;
     State.StartOfLineLevel = State.ParenLevel;
     State.LowestLevelOnLine = State.ParenLevel;
 
@@ -343,8 +343,8 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline,
       State.Stack.back().VariablePos = State.Column;
       // Move over * and & if they are bound to the variable name.
       const FormatToken *Tok = &Previous;
-      while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
-        State.Stack.back().VariablePos -= Tok->CodePointCount;
+      while (Tok && State.Stack.back().VariablePos >= Tok->ColumnWidth) {
+        State.Stack.back().VariablePos -= Tok->ColumnWidth;
         if (Tok->SpacesRequiredBefore != 0)
           break;
         Tok = Tok->Previous;
@@ -361,12 +361,12 @@ unsigned ContinuationIndenter::addTokenToState(LineState &State, bool Newline,
     if (Current.Type == TT_ObjCSelectorName &&
         State.Stack.back().ColonPos == 0) {
       if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
-          State.Column + Spaces + Current.CodePointCount)
+          State.Column + Spaces + Current.ColumnWidth)
         State.Stack.back().ColonPos =
             State.Stack.back().Indent + Current.LongestObjCSelectorName;
       else
         State.Stack.back().ColonPos =
-            State.Column + Spaces + Current.CodePointCount;
+            State.Column + Spaces + Current.ColumnWidth;
     }
 
     if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
@@ -436,7 +436,7 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State,
         std::min(State.LowestLevelOnLine, State.ParenLevel);
   if (Current.isMemberAccess())
     State.Stack.back().StartOfFunctionCall =
-        Current.LastInChainOfCalls ? 0 : State.Column + Current.CodePointCount;
+        Current.LastInChainOfCalls ? 0 : State.Column + Current.ColumnWidth;
   if (Current.Type == TT_CtorInitializerColon) {
     // Indent 2 from the column, so:
     // SomeClass::SomeClass()
@@ -592,7 +592,7 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State,
     State.StartOfStringLiteral = 0;
   }
 
-  State.Column += Current.CodePointCount;
+  State.Column += Current.ColumnWidth;
   State.NextToken = State.NextToken->Next;
   unsigned Penalty = breakProtrudingToken(Current, State, DryRun);
   if (State.Column > getColumnLimit(State)) {
@@ -618,8 +618,7 @@ ContinuationIndenter::addMultilineStringLiteral(const FormatToken &Current,
   for (unsigned i = 0, e = State.Stack.size(); i != e; ++i)
     State.Stack[i].BreakBeforeParameter = true;
 
-  unsigned ColumnsUsed =
-      State.Column - Current.CodePointCount + Current.FirstLineColumnWidth;
+  unsigned ColumnsUsed = State.Column;
   // We can only affect layout of the first and the last line, so the penalty
   // for all other lines is constant, and we ignore it.
   State.Column = Current.LastLineColumnWidth;
@@ -636,14 +635,14 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
     return 0;
 
   llvm::OwningPtr<BreakableToken> Token;
-  unsigned StartColumn = State.Column - Current.CodePointCount;
+  unsigned StartColumn = State.Column - Current.ColumnWidth;
 
   if (Current.is(tok::string_literal) &&
       Current.Type != TT_ImplicitStringLiteral) {
     // Don't break string literals with (in case of non-raw strings, escaped)
     // newlines. As clang-format must not change the string's content, it is
     // unlikely that we'll end up with a better format.
-    if (Current.isMultiline())
+    if (Current.IsMultiline)
       return addMultilineStringLiteral(Current, State);
 
     // Only break up default narrow strings.
@@ -657,11 +656,8 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
     Token.reset(new BreakableStringLiteral(
         Current, StartColumn, State.Line->InPPDirective, Encoding, Style));
   } else if (Current.Type == TT_BlockComment && Current.isTrailingComment()) {
-    unsigned OriginalStartColumn =
-        SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
-        1;
     Token.reset(new BreakableBlockComment(
-        Current, StartColumn, OriginalStartColumn, !Current.Previous,
+        Current, StartColumn, Current.OriginalColumn, !Current.Previous,
         State.Line->InPPDirective, Encoding, Style));
   } else if (Current.Type == TT_LineComment &&
              (Current.Previous == NULL ||
@@ -673,10 +669,8 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
     // FIXME: If we want to handle them correctly, we'll need to adjust
     // leading whitespace in consecutive lines when changing indentation of
     // the first line similar to what we do with block comments.
-    if (Current.isMultiline()) {
-      State.Column = StartColumn + Current.FirstLineColumnWidth;
+    if (Current.IsMultiline)
       return 0;
-    }
 
     Token.reset(new BreakableLineComment(
         Current, StartColumn, State.Line->InPPDirective, Encoding, Style));
@@ -759,12 +753,12 @@ bool ContinuationIndenter::NextIsMultilineString(const LineState &State) {
   // AlwaysBreakBeforeMultilineStrings implementation.
   if (Current.TokenText.startswith("R\""))
     return false;
-  if (Current.isMultiline())
+  if (Current.IsMultiline)
     return true;
   if (Current.getNextNonComment() &&
       Current.getNextNonComment()->is(tok::string_literal))
     return true; // Implicit concatenation.
-  if (State.Column + Current.CodePointCount + Current.UnbreakableTailLength >
+  if (State.Column + Current.ColumnWidth + Current.UnbreakableTailLength >
       Style.ColumnLimit)
     return true; // String will be split.
   return false;
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index fe6660b3bc..3a0802a460 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -610,7 +610,7 @@ private:
       FormatTok->WhitespaceRange =
           SourceRange(GreaterLocation, GreaterLocation);
       FormatTok->TokenText = ">";
-      FormatTok->CodePointCount = 1;
+      FormatTok->ColumnWidth = 1;
       GreaterStashed = false;
       return FormatTok;
     }
@@ -666,6 +666,10 @@ private:
       Column = 0;
       FormatTok->TokenText = FormatTok->TokenText.substr(2);
     }
+
+    FormatTok->WhitespaceRange = SourceRange(
+        WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
+
     FormatTok->OriginalColumn = Column;
 
     TrailingWhitespace = 0;
@@ -685,24 +689,29 @@ private:
     }
 
     // Now FormatTok is the next non-whitespace token.
-    FormatTok->CodePointCount =
-        encoding::getCodePointCount(FormatTok->TokenText, Encoding);
-
-    if (FormatTok->isOneOf(tok::string_literal, tok::comment)) {
-      StringRef Text = FormatTok->TokenText;
-      size_t FirstNewlinePos = Text.find('\n');
-      if (FirstNewlinePos != StringRef::npos) {
-        // FIXME: Handle embedded tabs.
-        FormatTok->FirstLineColumnWidth = encoding::columnWidthWithTabs(
-            Text.substr(0, FirstNewlinePos), 0, Style.TabWidth, Encoding);
-        FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
-            Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
-            Encoding);
-      }
+
+    StringRef Text = FormatTok->TokenText;
+    size_t FirstNewlinePos = Text.find('\n');
+    if (FirstNewlinePos != StringRef::npos) {
+      FormatTok->IsMultiline = true;
+      // The last line of the token always starts in column 0.
+      // Thus, the length can be precomputed even in the presence of tabs.
+      FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
+          Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
+          Encoding);
+      Text = Text.substr(0, FirstNewlinePos);
     }
-    // FIXME: Add the CodePointCount to Column.
-    FormatTok->WhitespaceRange = SourceRange(
-        WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
+
+    // FIXME: ColumnWidth actually depends on the start column, we need to
+    // take this into account when the token is moved.
+    FormatTok->ColumnWidth =
+        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
+
+    // FIXME: For multi-line tokens this should be LastLineColumnWidth.
+    // For line comments this should probably be zero. But before changing,
+    // we need good tests for this.
+    Column += FormatTok->ColumnWidth;
+
     return FormatTok;
   }
 
diff --git a/lib/Format/FormatToken.cpp b/lib/Format/FormatToken.cpp
index e6c72ab0de..49e0ce8337 100644
--- a/lib/Format/FormatToken.cpp
+++ b/lib/Format/FormatToken.cpp
@@ -42,7 +42,7 @@ unsigned CommaSeparatedList::format(LineState &State,
   // Calculate the number of code points we have to format this list. As the
   // first token is already placed, we have to subtract it.
   unsigned RemainingCodePoints = Style.ColumnLimit - State.Column +
-                                 State.NextToken->Previous->CodePointCount;
+                                 State.NextToken->Previous->ColumnWidth;
 
   // Find the best ColumnFormat, i.e. the best number of columns to use.
   const ColumnFormat *Format = getColumnFormat(RemainingCodePoints);
@@ -82,7 +82,7 @@ unsigned CommaSeparatedList::format(LineState &State,
 static unsigned CodePointsBetween(const FormatToken *Begin,
                                   const FormatToken *End) {
   assert(End->TotalLength >= Begin->TotalLength);
-  return End->TotalLength - Begin->TotalLength + Begin->CodePointCount;
+  return End->TotalLength - Begin->TotalLength + Begin->ColumnWidth;
 }
 
 void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) {
diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h
index e374f835d1..c02ac6357a 100644
--- a/lib/Format/FormatToken.h
+++ b/lib/Format/FormatToken.h
@@ -83,7 +83,7 @@ class AnnotatedLine;
 struct FormatToken {
   FormatToken()
       : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
-        CodePointCount(0), FirstLineColumnWidth(0), LastLineColumnWidth(0),
+        ColumnWidth(0), LastLineColumnWidth(0), IsMultiline(false),
         IsFirst(false), MustBreakBefore(false), IsUnterminatedLiteral(false),
         BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0),
         CanBreakBefore(false), ClosesTemplateDeclaration(false),
@@ -114,22 +114,17 @@ struct FormatToken {
   /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'.
   unsigned LastNewlineOffset;
 
-  /// \brief The length of the non-whitespace parts of the token in CodePoints.
+  /// \brief The width of the non-whitespace parts of the token (or its first
+  /// line for multi-line tokens) in columns.
   /// We need this to correctly measure number of columns a token spans.
-  unsigned CodePointCount;
+  unsigned ColumnWidth;
 
-  /// \brief Contains the number of code points in the first line of a
-  /// multi-line string literal or comment. Zero if there's no newline in the
+  /// \brief Contains the width in columns of the last line of a multi-line
   /// token.
-  unsigned FirstLineColumnWidth;
-
-  /// \brief Contains the number of code points in the last line of a
-  /// multi-line string literal or comment. Can be zero for line comments.
   unsigned LastLineColumnWidth;
 
-  /// \brief Returns \c true if the token text contains newlines (escaped or
-  /// not).
-  bool isMultiline() const { return FirstLineColumnWidth != 0; }
+  /// \brief Whether the token text contains newlines (escaped or not).
+  bool IsMultiline;
 
   /// \brief Indicates that this is the first token.
   bool IsFirst;
@@ -189,12 +184,8 @@ struct FormatToken {
   /// token.
   unsigned TotalLength;
 
-  /// \brief The original column of this token, including expanded tabs.
-  /// The configured IndentWidth is used as tab width. Only tabs in whitespace
-  /// are expanded.
-  /// FIXME: This is currently only used on the first token of an unwrapped
-  /// line, and the implementation is not correct for other tokens (see the
-  /// FIXMEs in FormatTokenLexer::getNextToken()).
+  /// \brief The original 0-based column of this token, including expanded tabs.
+  /// The configured TabWidth is used as tab width.
   unsigned OriginalColumn;
 
   /// \brief The length of following tokens until the next natural split point,
diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp
index e3cd2105e6..c0897ca68c 100644
--- a/lib/Format/TokenAnnotator.cpp
+++ b/lib/Format/TokenAnnotator.cpp
@@ -326,10 +326,10 @@ private:
                  Line.First->Type == TT_ObjCMethodSpecifier) {
         Tok->Type = TT_ObjCMethodExpr;
         Tok->Previous->Type = TT_ObjCSelectorName;
-        if (Tok->Previous->CodePointCount >
+        if (Tok->Previous->ColumnWidth >
             Contexts.back().LongestObjCSelectorName) {
           Contexts.back().LongestObjCSelectorName =
-              Tok->Previous->CodePointCount;
+              Tok->Previous->ColumnWidth;
         }
         if (Contexts.back().FirstObjCSelectorName == NULL)
           Contexts.back().FirstObjCSelectorName = Tok->Previous;
@@ -1022,7 +1022,8 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) {
 }
 
 void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
-  Line.First->TotalLength = Line.First->CodePointCount;
+  Line.First->TotalLength =
+      Line.First->IsMultiline ? Style.ColumnLimit : Line.First->ColumnWidth;
   if (!Line.First->Next)
     return;
   FormatToken *Current = Line.First->Next;
@@ -1055,11 +1056,11 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
     Current->CanBreakBefore =
         Current->MustBreakBefore || canBreakBefore(Line, *Current);
     if (Current->MustBreakBefore || !Current->Children.empty() ||
-        (Current->is(tok::string_literal) && Current->isMultiline()))
+        Current->IsMultiline)
       Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit;
     else
       Current->TotalLength = Current->Previous->TotalLength +
-                             Current->CodePointCount +
+                             Current->ColumnWidth +
                              Current->SpacesRequiredBefore;
     // FIXME: Only calculate this if CanBreakBefore is true once static
     // initializers etc. are sorted out.
@@ -1095,7 +1096,7 @@ void TokenAnnotator::calculateUnbreakableTailLengths(AnnotatedLine &Line) {
       UnbreakableTailLength = 0;
     } else {
       UnbreakableTailLength +=
-          Current->CodePointCount + Current->SpacesRequiredBefore;
+          Current->ColumnWidth + Current->SpacesRequiredBefore;
     }
     Current = Current->Previous;
   }
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp
index ef7d63c1eb..8ae956e4fd 100644
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -5738,33 +5738,48 @@ TEST_F(FormatTest, ConfigurableUseOfTab) {
   // FIXME: To correctly count mixed whitespace we need to
   // also correctly count mixed whitespace in front of the comment.
 
-  // Tab.TabWidth = 8;
-  // Tab.IndentWidth = 8;
-  // EXPECT_EQ("/*\n"
-  //           "\t      a\t\tcomment\n"
-  //           "\t      in multiple lines\n"
-  //           "       */",
-  //           format("   /*\t \t \n"
-  //                  " \t \t a\t\tcomment\t \t\n"
-  //                  " \t \t in multiple lines\t\n"
-  //                  " \t  */",
-  //                  Tab));
-  // Tab.UseTab = false;
-  // EXPECT_EQ("/*\n"
-  //           "              a\t\tcomment\n"
-  //           "              in multiple lines\n"
-  //           "       */",
-  //           format("   /*\t \t \n"
-  //                  " \t \t a\t\tcomment\t \t\n"
-  //                  " \t \t in multiple lines\t\n"
-  //                  " \t  */",
-  //                  Tab));
-  // EXPECT_EQ("/* some\n"
-  //           "   comment */",
-  //          format(" \t \t /* some\n"
-  //                 " \t \t    comment */",
-  //                 Tab));
-
+  Tab.TabWidth = 8;
+  Tab.IndentWidth = 8;
+  EXPECT_EQ("/*\n"
+            "\t      a\t\tcomment\n"
+            "\t      in multiple lines\n"
+            "       */",
+            format("   /*\t \t \n"
+                   " \t \t a\t\tcomment\t \t\n"
+                   " \t \t in multiple lines\t\n"
+                   " \t  */",
+                   Tab));
+  Tab.UseTab = false;
+  EXPECT_EQ("/*\n"
+            "              a\t\tcomment\n"
+            "              in multiple lines\n"
+            "       */",
+            format("   /*\t \t \n"
+                   " \t \t a\t\tcomment\t \t\n"
+                   " \t \t in multiple lines\t\n"
+                   " \t  */",
+                   Tab));
+  EXPECT_EQ("/* some\n"
+            "   comment */",
+           format(" \t \t /* some\n"
+                  " \t \t    comment */",
+                  Tab));
+  EXPECT_EQ("int a; /* some\n"
+            "   comment */",
+           format(" \t \t int a; /* some\n"
+                  " \t \t    comment */",
+                  Tab));
+
+  EXPECT_EQ("int a; /* some\n"
+            "comment */",
+           format(" \t \t int\ta; /* some\n"
+                  " \t \t    comment */",
+                  Tab));
+  EXPECT_EQ("f(\"\t\t\"); /* some\n"
+            "    comment */",
+           format(" \t \t f(\"\t\t\"); /* some\n"
+                  " \t \t    comment */",
+                  Tab));
   EXPECT_EQ("{\n"
             "  /*\n"
             "   * Comment\n"
-- 
2.40.0