UTF-8 support for clang-format.

author Alexander Kornienko <alexfh@google.com>

Wed, 5 Jun 2013 14:09:10 +0000 (14:09 +0000)

committer Alexander Kornienko <alexfh@google.com>

Wed, 5 Jun 2013 14:09:10 +0000 (14:09 +0000)
author Alexander Kornienko <alexfh@google.com>
Wed, 5 Jun 2013 14:09:10 +0000 (14:09 +0000)
committer Alexander Kornienko <alexfh@google.com>
Wed, 5 Jun 2013 14:09:10 +0000 (14:09 +0000)
diff --git a/lib/Format/BreakableToken.cpp b/lib/Format/BreakableToken.cpp

index 7d7fe3f032eebfb33fc7c89d5d2ba770c4b660ac..5e5604c597f9d3adfb7b8cecd82af4c13c9f6d15 100644 (file)
--- a/lib/Format/BreakableToken.cpp
+++ b/lib/Format/BreakableToken.cpp
@@ -25,66 +25,22 @@ namespace clang {
  namespace format {
  namespace {
  
-// FIXME: Move helper string functions to where it makes sense.
-
-unsigned getOctalLength(StringRef Text) {
-  unsigned I = 1;
-  while (I < Text.size() && I < 4 && (Text[I] >= '0' && Text[I] <= '7')) {
-    ++I;
-  }
-  return I;
-}
-
-unsigned getHexLength(StringRef Text) {
-  unsigned I = 2; // Point after '\x'.
-  while (I < Text.size() && ((Text[I] >= '0' && Text[I] <= '9') ||
-                             (Text[I] >= 'a' && Text[I] <= 'f') ||
-                             (Text[I] >= 'A' && Text[I] <= 'F'))) {
-    ++I;
-  }
-  return I;
-}
-
-unsigned getEscapeSequenceLength(StringRef Text) {
-  assert(Text[0] == '\\');
-  if (Text.size() < 2)
-    return 1;
-
-  switch (Text[1]) {
-  case 'u':
-    return 6;
-  case 'U':
-    return 10;
-  case 'x':
-    return getHexLength(Text);
-  default:
-    if (Text[1] >= '0' && Text[1] <= '7')
-      return getOctalLength(Text);
-    return 2;
-  }
-}
-
-StringRef::size_type getStartOfCharacter(StringRef Text,
-                                         StringRef::size_type Offset) {
-  StringRef::size_type NextEscape = Text.find('\\');
-  while (NextEscape != StringRef::npos && NextEscape < Offset) {
-    StringRef::size_type SequenceLength =
-        getEscapeSequenceLength(Text.substr(NextEscape));
-    if (Offset < NextEscape + SequenceLength)
-      return NextEscape;
-    NextEscape = Text.find('\\', NextEscape + SequenceLength);
-  }
-  return Offset;
-}
-
  BreakableToken::Split getCommentSplit(StringRef Text,
                                        unsigned ContentStartColumn,
-                                      unsigned ColumnLimit) {
+                                      unsigned ColumnLimit,
+                                      encoding::Encoding Encoding) {
    if (ColumnLimit <= ContentStartColumn + 1)
      return BreakableToken::Split(StringRef::npos, 0);
  
    unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
-  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
+  unsigned MaxSplitBytes = 0;
+
+  for (unsigned NumChars = 0;
+       NumChars < MaxSplit && MaxSplitBytes < Text.size(); ++NumChars)
+    MaxSplitBytes +=
+        encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
+
+  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplitBytes);
    if (SpaceOffset == StringRef::npos ||
        // Don't break at leading whitespace.
        Text.find_last_not_of(' ', SpaceOffset) == StringRef::npos) {
@@ -95,7 +51,7 @@ BreakableToken::Split getCommentSplit(StringRef Text,
        // If the comment is only whitespace, we cannot split.
        return BreakableToken::Split(StringRef::npos, 0);
      SpaceOffset =
-        Text.find(' ', std::max<unsigned>(MaxSplit, FirstNonWhitespace));
+        Text.find(' ', std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
    }
    if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
      StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim();
@@ -108,25 +64,48 @@ BreakableToken::Split getCommentSplit(StringRef Text,
  
  BreakableToken::Split getStringSplit(StringRef Text,
                                       unsigned ContentStartColumn,
-                                     unsigned ColumnLimit) {
-
-  if (ColumnLimit <= ContentStartColumn)
-    return BreakableToken::Split(StringRef::npos, 0);
-  unsigned MaxSplit = ColumnLimit - ContentStartColumn;
+                                     unsigned ColumnLimit,
+                                     encoding::Encoding Encoding) {
    // FIXME: Reduce unit test case.
    if (Text.empty())
      return BreakableToken::Split(StringRef::npos, 0);
-  MaxSplit = std::min<unsigned>(MaxSplit, Text.size() - 1);
-  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
-  if (SpaceOffset != StringRef::npos && SpaceOffset != 0)
+  if (ColumnLimit <= ContentStartColumn)
+    return BreakableToken::Split(StringRef::npos, 0);
+  unsigned MaxSplit =
+      std::min<unsigned>(ColumnLimit - ContentStartColumn,
+                         encoding::getCodePointCount(Text, Encoding) - 1);
+  StringRef::size_type SpaceOffset = 0;
+  StringRef::size_type SlashOffset = 0;
+  StringRef::size_type SplitPoint = 0;
+  for (unsigned Chars = 0;;) {
+    unsigned Advance;
+    if (Text[0] == '\\') {
+      Advance = encoding::getEscapeSequenceLength(Text);
+      Chars += Advance;
+    } else {
+      Advance = encoding::getCodePointNumBytes(Text[0], Encoding);
+      Chars += 1;
+    }
+
+    if (Chars > MaxSplit)
+      break;
+
+    if (Text[0] == ' ')
+      SpaceOffset = SplitPoint;
+    if (Text[0] == '/')
+      SlashOffset = SplitPoint;
+
+    SplitPoint += Advance;
+    Text = Text.substr(Advance);
+  }
+
+  if (SpaceOffset != 0)
      return BreakableToken::Split(SpaceOffset + 1, 0);
-  StringRef::size_type SlashOffset = Text.rfind('/', MaxSplit);
-  if (SlashOffset != StringRef::npos && SlashOffset != 0)
+  if (SlashOffset != 0)
      return BreakableToken::Split(SlashOffset + 1, 0);
-  StringRef::size_type SplitPoint = getStartOfCharacter(Text, MaxSplit);
-  if (SplitPoint == StringRef::npos || SplitPoint == 0)
-    return BreakableToken::Split(StringRef::npos, 0);
-  return BreakableToken::Split(SplitPoint, 0);
+  if (SplitPoint != 0)
+    return BreakableToken::Split(SplitPoint, 0);
+  return BreakableToken::Split(StringRef::npos, 0);
  }
  
  } // namespace
@@ -136,8 +115,8 @@ unsigned BreakableSingleLineToken::getLineCount() const { return 1; }
  unsigned
  BreakableSingleLineToken::getLineLengthAfterSplit(unsigned LineIndex,
                                                    unsigned TailOffset) const {
-  return StartColumn + Prefix.size() + Postfix.size() + Line.size() -
-         TailOffset;
+  return StartColumn + Prefix.size() + Postfix.size() +
+         encoding::getCodePointCount(Line.substr(TailOffset), Encoding);
  }
  
  void BreakableSingleLineToken::insertBreak(unsigned LineIndex,
@@ -152,8 +131,9 @@ void BreakableSingleLineToken::insertBreak(unsigned LineIndex,
  BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok,
                                                     unsigned StartColumn,
                                                     StringRef Prefix,
-                                                   StringRef Postfix)
-    : BreakableToken(Tok), StartColumn(StartColumn), Prefix(Prefix),
+                                                   StringRef Postfix,
+                                                   encoding::Encoding Encoding)
+    : BreakableToken(Tok, Encoding), StartColumn(StartColumn), Prefix(Prefix),
        Postfix(Postfix) {
    assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
    Line = Tok.TokenText.substr(
@@ -161,13 +141,15 @@ BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok,
  }
  
  BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok,
-                                               unsigned StartColumn)
-    : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"") {}
+                                               unsigned StartColumn,
+                                               encoding::Encoding Encoding)
+    : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"", Encoding) {}
  
  BreakableToken::Split
  BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
                                   unsigned ColumnLimit) const {
-  return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit);
+  return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit,
+                        Encoding);
  }
  
  static StringRef getLineCommentPrefix(StringRef Comment) {
@@ -179,23 +161,23 @@ static StringRef getLineCommentPrefix(StringRef Comment) {
  }
  
  BreakableLineComment::BreakableLineComment(const FormatToken &Token,
-                                           unsigned StartColumn)
+                                           unsigned StartColumn,
+                                           encoding::Encoding Encoding)
      : BreakableSingleLineToken(Token, StartColumn,
-                               getLineCommentPrefix(Token.TokenText), "") {}
+                               getLineCommentPrefix(Token.TokenText), "",
+                               Encoding) {}
  
  BreakableToken::Split
  BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset,
                                 unsigned ColumnLimit) const {
    return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(),
-                         ColumnLimit);
+                         ColumnLimit, Encoding);
  }
  
-BreakableBlockComment::BreakableBlockComment(const FormatStyle &Style,
-                                             const FormatToken &Token,
-                                             unsigned StartColumn,
-                                             unsigned OriginalStartColumn,
-                                             bool FirstInLine)
-    : BreakableToken(Token) {
+BreakableBlockComment::BreakableBlockComment(
+    const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn,
+    unsigned OriginalStartColumn, bool FirstInLine, encoding::Encoding Encoding)
+    : BreakableToken(Token, Encoding) {
    StringRef TokenText(Token.TokenText);
    assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
    TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
@@ -290,7 +272,8 @@ unsigned
  BreakableBlockComment::getLineLengthAfterSplit(unsigned LineIndex,
                                                 unsigned TailOffset) const {
    return getContentStartColumn(LineIndex, TailOffset) +
-         (Lines[LineIndex].size() - TailOffset) +
+         encoding::getCodePointCount(Lines[LineIndex].substr(TailOffset),
+                                     Encoding) +
           // The last line gets a "*/" postfix.
           (LineIndex + 1 == Lines.size() ? 2 : 0);
  }
@@ -300,7 +283,7 @@ BreakableBlockComment::getSplit(unsigned LineIndex, unsigned TailOffset,
                                  unsigned ColumnLimit) const {
    return getCommentSplit(Lines[LineIndex].substr(TailOffset),
                           getContentStartColumn(LineIndex, TailOffset),
-                         ColumnLimit);
+                         ColumnLimit, Encoding);
  }
  
  void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
diff --git a/lib/Format/BreakableToken.h b/lib/Format/BreakableToken.h

index 03904c2a4680379546b9be5ed6ea669a3ad4ba47..157bff4c42fffb6f4cd476d25a6184920052178c 100644 (file)
--- a/lib/Format/BreakableToken.h
+++ b/lib/Format/BreakableToken.h
@@ -17,6 +17,7 @@
  #ifndef LLVM_CLANG_FORMAT_BREAKABLETOKEN_H
  #define LLVM_CLANG_FORMAT_BREAKABLETOKEN_H
  
+#include "Encoding.h"
  #include "TokenAnnotator.h"
  #include "WhitespaceManager.h"
  #include <utility>
@@ -65,9 +66,11 @@ public:
                                         WhitespaceManager &Whitespaces) {}
  
  protected:
-  BreakableToken(const FormatToken &Tok) : Tok(Tok) {}
+  BreakableToken(const FormatToken &Tok, encoding::Encoding Encoding)
+      : Tok(Tok), Encoding(Encoding) {}
  
    const FormatToken &Tok;
+  encoding::Encoding Encoding;
  };
  
  /// \brief Base class for single line tokens that can be broken.
@@ -83,7 +86,8 @@ public:
  
  protected:
    BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn,
-                           StringRef Prefix, StringRef Postfix);
+                           StringRef Prefix, StringRef Postfix,
+                           encoding::Encoding Encoding);
  
    // The column in which the token starts.
    unsigned StartColumn;
@@ -101,7 +105,8 @@ public:
    ///
    /// \p StartColumn specifies the column in which the token will start
    /// after formatting.
-  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn);
+  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
+                         encoding::Encoding Encoding);
  
    virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
                           unsigned ColumnLimit) const;
@@ -113,7 +118,8 @@ public:
    ///
    /// \p StartColumn specifies the column in which the comment will start
    /// after formatting.
-  BreakableLineComment(const FormatToken &Token, unsigned StartColumn);
+  BreakableLineComment(const FormatToken &Token, unsigned StartColumn,
+                       encoding::Encoding Encoding);
  
    virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
                           unsigned ColumnLimit) const;
@@ -129,7 +135,7 @@ public:
    /// If the comment starts a line after formatting, set \p FirstInLine to true.
    BreakableBlockComment(const FormatStyle &Style, const FormatToken &Token,
                          unsigned StartColumn, unsigned OriginaStartColumn,
-                        bool FirstInLine);
+                        bool FirstInLine, encoding::Encoding Encoding);
  
    virtual unsigned getLineCount() const;
    virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
diff --git a/lib/Format/Encoding.h b/lib/Format/Encoding.h

new file mode 100644 (file)

index 0000000..a44f459
--- /dev/null
+++ b/lib/Format/Encoding.h
@@ -0,0 +1,114 @@
+//===--- Encoding.h - Format C++ code -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
+/// 8-bit encodings and escape sequences in C++ string literals.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_FORMAT_ENCODING_H
+#define LLVM_CLANG_FORMAT_ENCODING_H
+
+#include "clang/Basic/LLVM.h"
+#include "llvm/Support/ConvertUTF.h"
+
+namespace clang {
+namespace format {
+namespace encoding {
+
+/// \brief Text encodings the formatter distinguishes when measuring text.
+enum Encoding {
+  Encoding_UTF8, // Reported by detectEncoding for text that is legal UTF-8.
+  Encoding_Unknown // We treat all other encodings as 8-bit encodings.
+};
+
+/// \brief Classifies \p Text as Encoding_UTF8 when it forms a legal UTF-8
+/// string; anything else is reported as Encoding_Unknown and is treated as
+/// some 8-bit encoding.
+inline Encoding detectEncoding(StringRef Text) {
+  const UTF8 *Cursor = reinterpret_cast<const UTF8 *>(Text.begin());
+  const UTF8 *const Limit = reinterpret_cast<const UTF8 *>(Text.end());
+  return ::isLegalUTF8String(&Cursor, Limit) ? Encoding_UTF8
+                                             : Encoding_Unknown;
+}
+
+/// \brief Counts the code points in \p Text by stepping from the first byte
+/// of one sequence to the first byte of the next, using getNumBytesForUTF8 to
+/// obtain the size of each sequence.
+inline unsigned getCodePointCountUTF8(StringRef Text) {
+  unsigned Count = 0;
+  size_t Pos = 0;
+  const size_t Size = Text.size();
+  while (Pos < Size) {
+    Pos += getNumBytesForUTF8(Text[Pos]);
+    ++Count;
+  }
+  return Count;
+}
+
+/// \brief Returns how many code points \p Text contains when interpreted in
+/// \p Encoding: the UTF-8 code point count for Encoding_UTF8, and the byte
+/// count for every other encoding.
+inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
+  if (Encoding == Encoding_UTF8)
+    return getCodePointCountUTF8(Text);
+  // All remaining encodings are handled as one byte per code point.
+  return Text.size();
+}
+
+/// \brief Returns the byte length of the sequence that encodes one code point
+/// and begins with \p FirstChar, for text in the given \p Encoding.
+inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
+  if (Encoding == Encoding_UTF8)
+    return getNumBytesForUTF8(FirstChar);
+  // All remaining encodings are handled as one byte per code point.
+  return 1;
+}
+
+/// \brief Returns true if \p c is one of the octal digits '0' through '7'.
+inline bool isOctDigit(char c) {
+  return c >= '0' && c <= '7';
+}
+
+/// \brief Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or
+/// 'A'-'F'.
+inline bool isHexDigit(char c) {
+  if (c >= '0' && c <= '9')
+    return true;
+  if (c >= 'a' && c <= 'f')
+    return true;
+  return c >= 'A' && c <= 'F';
+}
+
+/// \brief Gets the length in bytes of an escape sequence inside a C++ string
+/// literal.
+///
+/// \p Text should span from the beginning of the escape sequence (starting
+/// with a backslash) to the end of the string literal and must not be empty.
+/// The result never exceeds Text.size(): a "\u" or "\U" escape that is cut
+/// short by the end of \p Text yields only the bytes that are present.
+inline unsigned getEscapeSequenceLength(StringRef Text) {
+  assert(!Text.empty() && Text[0] == '\\');
+  if (Text.size() < 2)
+    return 1;
+
+  switch (Text[1]) {
+  case 'u':
+    // Backslash, 'u' and four more characters, clamped to the available text
+    // so callers never advance past the end of Text.
+    return Text.size() < 6 ? static_cast<unsigned>(Text.size()) : 6u;
+  case 'U':
+    // Backslash, 'U' and eight more characters, clamped the same way.
+    return Text.size() < 10 ? static_cast<unsigned>(Text.size()) : 10u;
+  case 'x': {
+    unsigned I = 2; // Point after '\x'.
+    // A hex escape extends over every following hex digit.
+    while (I < Text.size() && isHexDigit(Text[I]))
+      ++I;
+    return I;
+  }
+  default:
+    if (isOctDigit(Text[1])) {
+      // An octal escape is the backslash plus at most three octal digits.
+      unsigned I = 1;
+      while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
+        ++I;
+      return I;
+    }
+    // NOTE(review): this counts exactly one byte after the backslash. If that
+    // byte begins a multi-byte UTF-8 sequence, the reported length ends inside
+    // the code point; confirm whether callers need the encoding here.
+    return 2;
+  }
+}
+
+} // namespace encoding
+} // namespace format
+} // namespace clang
+
+#endif // LLVM_CLANG_FORMAT_ENCODING_H
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp

index 63bf09317e3ca23c56fe491a60d8f411cb15dd92..9dd5e4a0f214e25a53202696bf15189a8b22df74 100644 (file)
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -243,10 +243,11 @@ public:
    UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr,
                           const AnnotatedLine &Line, unsigned FirstIndent,
                           const FormatToken *RootToken,
-                         WhitespaceManager &Whitespaces)
+                         WhitespaceManager &Whitespaces,
+                         encoding::Encoding Encoding)
        : Style(Style), SourceMgr(SourceMgr), Line(Line),
          FirstIndent(FirstIndent), RootToken(RootToken),
-        Whitespaces(Whitespaces), Count(0) {}
+        Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {}
  
    /// \brief Formats an \c UnwrappedLine.
    void format(const AnnotatedLine *NextLine) {
@@ -484,7 +485,7 @@ private:
                                   State.NextToken->WhitespaceRange.getEnd()) -
                               SourceMgr.getSpellingColumnNumber(
                                   State.NextToken->WhitespaceRange.getBegin());
-      State.Column += WhitespaceLength + State.NextToken->TokenLength;
+      State.Column += WhitespaceLength + State.NextToken->CodePointCount;
        State.NextToken = State.NextToken->Next;
        return 0;
      }
@@ -520,11 +521,11 @@ private:
                    Line.StartsDefinition)) {
          State.Column = State.Stack.back().Indent;
        } else if (Current.Type == TT_ObjCSelectorName) {
-        if (State.Stack.back().ColonPos > Current.TokenLength) {
-          State.Column = State.Stack.back().ColonPos - Current.TokenLength;
+        if (State.Stack.back().ColonPos > Current.CodePointCount) {
+          State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
          } else {
            State.Column = State.Stack.back().Indent;
-          State.Stack.back().ColonPos = State.Column + Current.TokenLength;
+          State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
          }
        } else if (Current.Type == TT_StartOfName ||
                   Previous.isOneOf(tok::coloncolon, tok::equal) ||
@@ -560,7 +561,7 @@ private:
        State.Stack.back().LastSpace = State.Column;
        if (Current.isOneOf(tok::arrow, tok::period) &&
            Current.Type != TT_DesignatedInitializerPeriod)
-        State.Stack.back().LastSpace += Current.TokenLength;
+        State.Stack.back().LastSpace += Current.CodePointCount;
        State.StartOfLineLevel = State.ParenLevel;
        State.LowestCallLevel = State.ParenLevel;
  
@@ -595,8 +596,8 @@ private:
          State.Stack.back().VariablePos = State.Column;
          // Move over * and & if they are bound to the variable name.
          const FormatToken *Tok = &Previous;
-        while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) {
-          State.Stack.back().VariablePos -= Tok->TokenLength;
+        while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
+          State.Stack.back().VariablePos -= Tok->CodePointCount;
            if (Tok->SpacesRequiredBefore != 0)
              break;
            Tok = Tok->Previous;
@@ -614,12 +615,12 @@ private:
        if (Current.Type == TT_ObjCSelectorName &&
            State.Stack.back().ColonPos == 0) {
          if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
-            State.Column + Spaces + Current.TokenLength)
+            State.Column + Spaces + Current.CodePointCount)
            State.Stack.back().ColonPos =
                State.Stack.back().Indent + Current.LongestObjCSelectorName;
          else
            State.Stack.back().ColonPos =
-              State.Column + Spaces + Current.TokenLength;
+              State.Column + Spaces + Current.CodePointCount;
        }
  
        if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
@@ -671,7 +672,8 @@ private:
        State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel);
        if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0)
          State.Stack.back().StartOfFunctionCall =
-            Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength;
+            Current.LastInChainOfCalls ? 0
+                                       : State.Column + Current.CodePointCount;
      }
      if (Current.Type == TT_CtorInitializerColon) {
        // Indent 2 from the column, so:
@@ -779,7 +781,7 @@ private:
        State.StartOfStringLiteral = 0;
      }
  
-    State.Column += Current.TokenLength;
+    State.Column += Current.CodePointCount;
  
      State.NextToken = State.NextToken->Next;
  
@@ -798,7 +800,7 @@ private:
                                  bool DryRun) {
      unsigned UnbreakableTailLength = Current.UnbreakableTailLength;
      llvm::OwningPtr<BreakableToken> Token;
-    unsigned StartColumn = State.Column - Current.TokenLength;
+    unsigned StartColumn = State.Column - Current.CodePointCount;
      unsigned OriginalStartColumn =
          SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
          1;
@@ -811,15 +813,16 @@ private:
        if (!LiteralData || *LiteralData != '"')
          return 0;
  
-      Token.reset(new BreakableStringLiteral(Current, StartColumn));
+      Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding));
      } else if (Current.Type == TT_BlockComment) {
        BreakableBlockComment *BBC = new BreakableBlockComment(
-          Style, Current, StartColumn, OriginalStartColumn, !Current.Previous);
+          Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
+          Encoding);
        Token.reset(BBC);
      } else if (Current.Type == TT_LineComment &&
                 (Current.Previous == NULL ||
                  Current.Previous->Type != TT_ImplicitStringLiteral)) {
-      Token.reset(new BreakableLineComment(Current, StartColumn));
+      Token.reset(new BreakableLineComment(Current, StartColumn, Encoding));
      } else {
        return 0;
      }
@@ -837,27 +840,27 @@ private:
                                         Whitespaces);
        }
        unsigned TailOffset = 0;
-      unsigned RemainingTokenLength =
+      unsigned RemainingTokenColumns =
            Token->getLineLengthAfterSplit(LineIndex, TailOffset);
-      while (RemainingTokenLength > RemainingSpace) {
+      while (RemainingTokenColumns > RemainingSpace) {
          BreakableToken::Split Split =
              Token->getSplit(LineIndex, TailOffset, getColumnLimit());
          if (Split.first == StringRef::npos)
            break;
          assert(Split.first != 0);
-        unsigned NewRemainingTokenLength = Token->getLineLengthAfterSplit(
+        unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit(
              LineIndex, TailOffset + Split.first + Split.second);
-        assert(NewRemainingTokenLength < RemainingTokenLength);
+        assert(NewRemainingTokenColumns < RemainingTokenColumns);
          if (!DryRun) {
            Token->insertBreak(LineIndex, TailOffset, Split, Line.InPPDirective,
                               Whitespaces);
          }
          TailOffset += Split.first + Split.second;
-        RemainingTokenLength = NewRemainingTokenLength;
+        RemainingTokenColumns = NewRemainingTokenColumns;
          Penalty += Style.PenaltyExcessCharacter;
          BreakInserted = true;
        }
-      PositionAfterLastLineInToken = RemainingTokenLength;
+      PositionAfterLastLineInToken = RemainingTokenColumns;
      }
  
      if (BreakInserted) {
@@ -1080,13 +1083,16 @@ private:
    // Increasing count of \c StateNode items we have created. This is used
    // to create a deterministic order independent of the container.
    unsigned Count;
+  encoding::Encoding Encoding;
  };
  
  class FormatTokenLexer {
  public:
-  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr)
+  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr,
+                   encoding::Encoding Encoding)
        : FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex),
-        SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) {
+        SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()),
+        Encoding(Encoding) {
      Lex.SetKeepWhitespaceMode(true);
    }
  
@@ -1111,7 +1117,8 @@ private:
            FormatTok->Tok.getLocation().getLocWithOffset(1);
        FormatTok->WhitespaceRange =
            SourceRange(GreaterLocation, GreaterLocation);
-      FormatTok->TokenLength = 1;
+      FormatTok->ByteCount = 1;
+      FormatTok->CodePointCount = 1;
        GreaterStashed = false;
        return FormatTok;
      }
@@ -1146,12 +1153,12 @@ private:
      }
  
      // Now FormatTok is the next non-whitespace token.
-    FormatTok->TokenLength = Text.size();
+    FormatTok->ByteCount = Text.size();
  
      TrailingWhitespace = 0;
      if (FormatTok->Tok.is(tok::comment)) {
        TrailingWhitespace = Text.size() - Text.rtrim().size();
-      FormatTok->TokenLength -= TrailingWhitespace;
+      FormatTok->ByteCount -= TrailingWhitespace;
      }
  
      // In case the token starts with escaped newlines, we want to
@@ -1164,7 +1171,7 @@ private:
      while (i + 1 < Text.size() && Text[i] == '\\' && Text[i + 1] == '\n') {
        // FIXME: ++FormatTok->NewlinesBefore is missing...
        WhitespaceLength += 2;
-      FormatTok->TokenLength -= 2;
+      FormatTok->ByteCount -= 2;
        i += 2;
      }
  
@@ -1176,15 +1183,19 @@ private:
  
      if (FormatTok->Tok.is(tok::greatergreater)) {
        FormatTok->Tok.setKind(tok::greater);
-      FormatTok->TokenLength = 1;
+      FormatTok->ByteCount = 1;
        GreaterStashed = true;
      }
  
+    unsigned EncodingExtraBytes =
+        Text.size() - encoding::getCodePointCount(Text, Encoding);
+    FormatTok->CodePointCount = FormatTok->ByteCount - EncodingExtraBytes;
+
      FormatTok->WhitespaceRange = SourceRange(
          WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
      FormatTok->TokenText = StringRef(
          SourceMgr.getCharacterData(FormatTok->getStartOfNonWhitespace()),
-        FormatTok->TokenLength);
+        FormatTok->ByteCount);
      return FormatTok;
    }
  
@@ -1194,6 +1205,7 @@ private:
    Lexer &Lex;
    SourceManager &SourceMgr;
    IdentifierTable IdentTable;
+  encoding::Encoding Encoding;
    llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
    SmallVector<FormatToken *, 16> Tokens;
  
@@ -1209,17 +1221,22 @@ public:
    Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr,
              const std::vector<CharSourceRange> &Ranges)
        : Style(Style), Lex(Lex), SourceMgr(SourceMgr),
-        Whitespaces(SourceMgr, Style), Ranges(Ranges) {}
+        Whitespaces(SourceMgr, Style), Ranges(Ranges),
+        Encoding(encoding::detectEncoding(Lex.getBuffer())) {
+    DEBUG(llvm::dbgs()
+          << "File encoding: "
+          << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
+          << "\n");
+  }
  
    virtual ~Formatter() {}
  
    tooling::Replacements format() {
-    FormatTokenLexer Tokens(Lex, SourceMgr);
+    FormatTokenLexer Tokens(Lex, SourceMgr, Encoding);
  
      UnwrappedLineParser Parser(Style, Tokens.lex(), *this);
      bool StructuralError = Parser.parse();
-    TokenAnnotator Annotator(Style, SourceMgr, Lex,
-                             Tokens.getIdentTable().get("in"));
+    TokenAnnotator Annotator(Style, Tokens.getIdentTable().get("in"));
      for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
        Annotator.annotate(AnnotatedLines[i]);
      }
@@ -1290,7 +1307,7 @@ public:
                1;
          }
          UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent,
-                                         TheLine.First, Whitespaces);
+                                         TheLine.First, Whitespaces, Encoding);
          Formatter.format(I + 1 != E ? &*(I + 1) : NULL);
          IndentForLevel[TheLine.Level] = LevelIndent;
          PreviousLineWasTouched = true;
@@ -1556,7 +1573,7 @@ private:
      CharSourceRange LineRange = CharSourceRange::getCharRange(
          First->WhitespaceRange.getBegin().getLocWithOffset(
              First->LastNewlineOffset),
-        Last->Tok.getLocation().getLocWithOffset(Last->TokenLength - 1));
+        Last->Tok.getLocation().getLocWithOffset(Last->ByteCount - 1));
      return touchesRanges(LineRange);
    }
  
@@ -1616,6 +1633,8 @@ private:
    WhitespaceManager Whitespaces;
    std::vector<CharSourceRange> Ranges;
    std::vector<AnnotatedLine> AnnotatedLines;
+
+  encoding::Encoding Encoding;
  };
  
  tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex,
diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h

index 4a5e20dd4c6cc251b5382ee3ae3612cd5000929a..fd1bd7e1cf863f1bf5c522766704db26cf31d075 100644 (file)
--- a/lib/Format/FormatToken.h
+++ b/lib/Format/FormatToken.h
@@ -61,11 +61,12 @@ enum TokenType {
  struct FormatToken {
    FormatToken()
        : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
-        TokenLength(0), IsFirst(false), MustBreakBefore(false),
-        Type(TT_Unknown), SpacesRequiredBefore(0), CanBreakBefore(false),
-        ClosesTemplateDeclaration(false), ParameterCount(0), TotalLength(0),
-        UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0),
-        LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false),
+        ByteCount(0), CodePointCount(0), IsFirst(false),
+        MustBreakBefore(false), Type(TT_Unknown), SpacesRequiredBefore(0),
+        CanBreakBefore(false), ClosesTemplateDeclaration(false),
+        ParameterCount(0), TotalLength(0), UnbreakableTailLength(0),
+        BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0),
+        FakeRParens(0), LastInChainOfCalls(false),
          PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL),
          Next(NULL) {}
  
@@ -89,10 +90,14 @@ struct FormatToken {
    /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'.
    unsigned LastNewlineOffset;
  
-  /// \brief The length of the non-whitespace parts of the token. This is
-  /// necessary because we need to handle escaped newlines that are stored
+  /// \brief The number of bytes of the non-whitespace parts of the token. This
+  /// is necessary because we need to handle escaped newlines that are stored
    /// with the token.
-  unsigned TokenLength;
+  unsigned ByteCount;
+
+  /// \brief The length of the non-whitespace parts of the token in CodePoints.
+  /// We need this to correctly measure the number of columns a token spans.
+  unsigned CodePointCount;
  
    /// \brief Indicates that this is the first token.
    bool IsFirst;
diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp

index 83dea841b5ec7c5041431513c6e735ebe26182a7..62177b3efd769c22e86c6212f7566d9c833e1d12 100644 (file)
--- a/lib/Format/TokenAnnotator.cpp
+++ b/lib/Format/TokenAnnotator.cpp
@@ -15,7 +15,6 @@
  
  #include "TokenAnnotator.h"
  #include "clang/Basic/SourceManager.h"
-#include "clang/Lex/Lexer.h"
  #include "llvm/Support/Debug.h"
  
  namespace clang {
@@ -28,10 +27,9 @@ namespace format {
  /// into template parameter lists.
  class AnnotatingParser {
  public:
-  AnnotatingParser(SourceManager &SourceMgr, Lexer &Lex, AnnotatedLine &Line,
-                   IdentifierInfo &Ident_in)
-      : SourceMgr(SourceMgr), Lex(Lex), Line(Line), CurrentToken(Line.First),
-        KeywordVirtualFound(false), NameFound(false), Ident_in(Ident_in) {
+  AnnotatingParser(AnnotatedLine &Line, IdentifierInfo &Ident_in)
+      : Line(Line), CurrentToken(Line.First), KeywordVirtualFound(false),
+        NameFound(false), Ident_in(Ident_in) {
      Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/ false));
    }
  
@@ -295,9 +293,11 @@ private:
                   Line.First->Type == TT_ObjCMethodSpecifier) {
          Tok->Type = TT_ObjCMethodExpr;
          Tok->Previous->Type = TT_ObjCSelectorName;
-        if (Tok->Previous->TokenLength >
-            Contexts.back().LongestObjCSelectorName)
-          Contexts.back().LongestObjCSelectorName = Tok->Previous->TokenLength;
+        if (Tok->Previous->CodePointCount >
+            Contexts.back().LongestObjCSelectorName) {
+          Contexts.back().LongestObjCSelectorName =
+              Tok->Previous->CodePointCount;
+        }
          if (Contexts.back().FirstObjCSelectorName == NULL)
            Contexts.back().FirstObjCSelectorName = Tok->Previous;
        } else if (Contexts.back().ColonIsForRangeExpr) {
@@ -602,9 +602,7 @@ private:
        } else if (Current.isBinaryOperator()) {
          Current.Type = TT_BinaryOperator;
        } else if (Current.is(tok::comment)) {
-        std::string Data(
-            Lexer::getSpelling(Current.Tok, SourceMgr, Lex.getLangOpts()));
-        if (StringRef(Data).startswith("//"))
+        if (Current.TokenText.startswith("//"))
            Current.Type = TT_LineComment;
          else
            Current.Type = TT_BlockComment;
@@ -748,23 +746,19 @@ private:
      case tok::kw_wchar_t:
      case tok::kw_bool:
      case tok::kw___underlying_type:
-      return true;
      case tok::annot_typename:
      case tok::kw_char16_t:
      case tok::kw_char32_t:
      case tok::kw_typeof:
      case tok::kw_decltype:
-      return Lex.getLangOpts().CPlusPlus;
+      return true;
      default:
-      break;
+      return false;
      }
-    return false;
    }
  
    SmallVector<Context, 8> Contexts;
  
-  SourceManager &SourceMgr;
-  Lexer &Lex;
    AnnotatedLine &Line;
    FormatToken *CurrentToken;
    bool KeywordVirtualFound;
@@ -866,7 +860,7 @@ private:
  };
  
  void TokenAnnotator::annotate(AnnotatedLine &Line) {
-  AnnotatingParser Parser(SourceMgr, Lex, Line, Ident_in);
+  AnnotatingParser Parser(Line, Ident_in);
    Line.Type = Parser.parseLine();
    if (Line.Type == LT_Invalid)
      return;
@@ -886,7 +880,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) {
  }
  
  void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
-  Line.First->TotalLength = Line.First->TokenLength;
+  Line.First->TotalLength = Line.First->CodePointCount;
    if (!Line.First->Next)
      return;
    FormatToken *Current = Line.First->Next;
@@ -920,7 +914,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
        Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit;
      else
        Current->TotalLength =
-          Current->Previous->TotalLength + Current->TokenLength +
+          Current->Previous->TotalLength + Current->CodePointCount +
            Current->SpacesRequiredBefore;
      // FIXME: Only calculate this if CanBreakBefore is true once static
      // initializers etc. are sorted out.
@@ -947,7 +941,7 @@ void TokenAnnotator::calculateUnbreakableTailLengths(AnnotatedLine &Line) {
        UnbreakableTailLength = 0;
      } else {
        UnbreakableTailLength +=
-          Current->TokenLength + Current->SpacesRequiredBefore;
+          Current->CodePointCount + Current->SpacesRequiredBefore;
      }
      Current = Current->Previous;
    }
@@ -1015,8 +1009,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
  
    if (Right.is(tok::lessless)) {
      if (Left.is(tok::string_literal)) {
-      StringRef Content =
-          StringRef(Left.Tok.getLiteralData(), Left.TokenLength);
+      StringRef Content = Left.TokenText;
        Content = Content.drop_back(1).drop_front(1).trim();
        if (Content.size() > 1 &&
            (Content.back() == ':' || Content.back() == '='))
diff --git a/lib/Format/TokenAnnotator.h b/lib/Format/TokenAnnotator.h

index a0d680c91dfe28dbdfbda5a1fed4e04036ad8b21..28d55a007c2080491ca843b0f57411ebfea1fb73 100644 (file)
--- a/lib/Format/TokenAnnotator.h
+++ b/lib/Format/TokenAnnotator.h
@@ -21,7 +21,6 @@
  #include <string>
  
  namespace clang {
-class Lexer;
  class SourceManager;
  
  namespace format {
@@ -71,10 +70,8 @@ public:
  /// \c UnwrappedLine.
  class TokenAnnotator {
  public:
-  TokenAnnotator(const FormatStyle &Style, SourceManager &SourceMgr, Lexer &Lex,
-                 IdentifierInfo &Ident_in)
-      : Style(Style), SourceMgr(SourceMgr), Lex(Lex), Ident_in(Ident_in) {
-  }
+  TokenAnnotator(const FormatStyle &Style, IdentifierInfo &Ident_in)
+      : Style(Style), Ident_in(Ident_in) {}
  
    void annotate(AnnotatedLine &Line);
    void calculateFormattingInformation(AnnotatedLine &Line);
@@ -95,8 +92,6 @@ private:
    void calculateUnbreakableTailLengths(AnnotatedLine &Line);
  
    const FormatStyle &Style;
-  SourceManager &SourceMgr;
-  Lexer &Lex;
  
    // Contextual keywords:
    IdentifierInfo &Ident_in;
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp

index 80072b5ab57c6af618a26c9eadf34601ad072a62..a959bcbe55c7e5066b38175caf250e02d84ec2cc 100644 (file)
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -4873,5 +4873,80 @@ TEST_F(FormatTest, ConfigurationRoundTripTest) {
    EXPECT_EQ(Style, ParsedStyle);
  }
  
+TEST_F(FormatTest, WorksFor8bitEncodings) {
+  EXPECT_EQ("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 \"\n"
+            "\"\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \"\n"
+            "\"\xe7\xe8\xec\xed\xfe\xfe \"\n"
+            "\"\xef\xee\xf0\xf3...\"",
+            format("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 "
+                   "\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \xe7\xe8\xec\xed\xfe\xfe "
+                   "\xef\xee\xf0\xf3...\"",
+                   getLLVMStyleWithColumns(12)));
+}
+
+TEST_F(FormatTest, CountsUTF8CharactersProperly) {
+  verifyFormat("\"Однажды в студёную зимнюю пору...\"",
+               getLLVMStyleWithColumns(35));
+  verifyFormat("\"一 二 三 四 五 六 七 八 九 十\"",
+               getLLVMStyleWithColumns(21));
+  verifyFormat("// Однажды в студёную зимнюю пору...",
+               getLLVMStyleWithColumns(36));
+  verifyFormat("// 一 二 三 四 五 六 七 八 九 十",
+               getLLVMStyleWithColumns(22));
+  verifyFormat("/* Однажды в студёную зимнюю пору... */",
+               getLLVMStyleWithColumns(39));
+  verifyFormat("/* 一 二 三 四 五 六 七 八 九 十 */",
+               getLLVMStyleWithColumns(25));
+}
+
+TEST_F(FormatTest, SplitsUTF8Strings) {
+  EXPECT_EQ(
+      "\"Однажды, в \"\n"
+      "\"студёную \"\n"
+      "\"зимнюю \"\n"
+      "\"пору,\"",
+      format("\"Однажды, в студёную зимнюю пору,\"",
+             getLLVMStyleWithColumns(13)));
+  EXPECT_EQ("\"一 二 三 四 \"\n"
+            "\"五 六 七 八 \"\n"
+            "\"九 十\"",
+            format("\"一 二 三 四 五 六 七 八 九 十\"",
+                   getLLVMStyleWithColumns(10)));
+}
+
+TEST_F(FormatTest, SplitsUTF8LineComments) {
+  EXPECT_EQ("// Я из лесу\n"
+            "// вышел; был\n"
+            "// сильный\n"
+            "// мороз.",
+            format("// Я из лесу вышел; был сильный мороз.",
+                   getLLVMStyleWithColumns(13)));
+  EXPECT_EQ("// 一二三\n"
+            "// 四五六七\n"
+            "// 八\n"
+            "// 九 十",
+            format("// 一二三 四五六七 八  九 十", getLLVMStyleWithColumns(6)));
+}
+
+TEST_F(FormatTest, SplitsUTF8BlockComments) {
+  EXPECT_EQ("/* Гляжу,\n"
+            " * поднимается\n"
+            " * медленно в\n"
+            " * гору\n"
+            " * Лошадка,\n"
+            " * везущая\n"
+            " * хворосту\n"
+            " * воз. */",
+            format("/* Гляжу, поднимается медленно в гору\n"
+                   " * Лошадка, везущая хворосту воз. */",
+                   getLLVMStyleWithColumns(13)));
+  EXPECT_EQ("/* 一二三\n"
+            " * 四五六七\n"
+            " * 八\n"
+            " * 九 十\n"
+            " */",
+            format("/* 一二三 四五六七 八  九 十 */", getLLVMStyleWithColumns(6)));
+}
+
  } // end namespace tooling
  } // end namespace clang
author	Alexander Kornienko <alexfh@google.com>
	Wed, 5 Jun 2013 14:09:10 +0000 (14:09 +0000)
committer	Alexander Kornienko <alexfh@google.com>
	Wed, 5 Jun 2013 14:09:10 +0000 (14:09 +0000)
lib/Format/BreakableToken.cpp		patch \| blob \| history
lib/Format/BreakableToken.h		patch \| blob \| history
lib/Format/Encoding.h	[new file with mode: 0644]	patch \| blob
lib/Format/Format.cpp		patch \| blob \| history
lib/Format/FormatToken.h		patch \| blob \| history
lib/Format/TokenAnnotator.cpp		patch \| blob \| history
lib/Format/TokenAnnotator.h		patch \| blob \| history
unittests/Format/FormatTest.cpp		patch \| blob \| history