namespace format {
namespace {
-// FIXME: Move helper string functions to where it makes sense.
-
-unsigned getOctalLength(StringRef Text) {
- unsigned I = 1;
- while (I < Text.size() && I < 4 && (Text[I] >= '0' && Text[I] <= '7')) {
- ++I;
- }
- return I;
-}
-
-unsigned getHexLength(StringRef Text) {
- unsigned I = 2; // Point after '\x'.
- while (I < Text.size() && ((Text[I] >= '0' && Text[I] <= '9') ||
- (Text[I] >= 'a' && Text[I] <= 'f') ||
- (Text[I] >= 'A' && Text[I] <= 'F'))) {
- ++I;
- }
- return I;
-}
-
-unsigned getEscapeSequenceLength(StringRef Text) {
- assert(Text[0] == '\\');
- if (Text.size() < 2)
- return 1;
-
- switch (Text[1]) {
- case 'u':
- return 6;
- case 'U':
- return 10;
- case 'x':
- return getHexLength(Text);
- default:
- if (Text[1] >= '0' && Text[1] <= '7')
- return getOctalLength(Text);
- return 2;
- }
-}
-
-StringRef::size_type getStartOfCharacter(StringRef Text,
- StringRef::size_type Offset) {
- StringRef::size_type NextEscape = Text.find('\\');
- while (NextEscape != StringRef::npos && NextEscape < Offset) {
- StringRef::size_type SequenceLength =
- getEscapeSequenceLength(Text.substr(NextEscape));
- if (Offset < NextEscape + SequenceLength)
- return NextEscape;
- NextEscape = Text.find('\\', NextEscape + SequenceLength);
- }
- return Offset;
-}
-
BreakableToken::Split getCommentSplit(StringRef Text,
unsigned ContentStartColumn,
- unsigned ColumnLimit) {
+ unsigned ColumnLimit,
+ encoding::Encoding Encoding) {
if (ColumnLimit <= ContentStartColumn + 1)
return BreakableToken::Split(StringRef::npos, 0);
unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
- StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
+ unsigned MaxSplitBytes = 0;
+
+ for (unsigned NumChars = 0;
+ NumChars < MaxSplit && MaxSplitBytes < Text.size(); ++NumChars)
+ MaxSplitBytes +=
+ encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
+
+ StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplitBytes);
if (SpaceOffset == StringRef::npos ||
// Don't break at leading whitespace.
Text.find_last_not_of(' ', SpaceOffset) == StringRef::npos) {
// If the comment is only whitespace, we cannot split.
return BreakableToken::Split(StringRef::npos, 0);
SpaceOffset =
- Text.find(' ', std::max<unsigned>(MaxSplit, FirstNonWhitespace));
+ Text.find(' ', std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
}
if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim();
+// Picks a position at which the string literal content \p Text can be broken.
+// The returned offset is in bytes, while the available room
+// (ColumnLimit - ContentStartColumn) is consumed in code points, with an
+// escape sequence counted by its full length. Returns a split at
+// StringRef::npos when Text is empty, when there is no room at all, or when
+// not even the first element fits.
BreakableToken::Split getStringSplit(StringRef Text,
unsigned ContentStartColumn,
- unsigned ColumnLimit) {
-
- if (ColumnLimit <= ContentStartColumn)
- return BreakableToken::Split(StringRef::npos, 0);
- unsigned MaxSplit = ColumnLimit - ContentStartColumn;
+ unsigned ColumnLimit,
+ encoding::Encoding Encoding) {
// FIXME: Reduce unit test case.
if (Text.empty())
return BreakableToken::Split(StringRef::npos, 0);
- MaxSplit = std::min<unsigned>(MaxSplit, Text.size() - 1);
- StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
- if (SpaceOffset != StringRef::npos && SpaceOffset != 0)
+ if (ColumnLimit <= ContentStartColumn)
+ return BreakableToken::Split(StringRef::npos, 0);
+ // The budget is capped at one less than the number of code points in Text.
+ unsigned MaxSplit =
+ std::min<unsigned>(ColumnLimit - ContentStartColumn,
+ encoding::getCodePointCount(Text, Encoding) - 1);
+ // Byte offsets relative to the original Text: the last space seen, the last
+ // slash seen, and the number of bytes consumed so far. The value 0 doubles
+ // as "not found" in the checks after the loop.
+ StringRef::size_type SpaceOffset = 0;
+ StringRef::size_type SlashOffset = 0;
+ StringRef::size_type SplitPoint = 0;
+ // Text is shortened from the front on every iteration, so Text[0] is always
+ // the first byte of the element currently being examined.
+ for (unsigned Chars = 0;;) {
+ unsigned Advance;
+ if (Text[0] == '\\') {
+ // An escape sequence is consumed as one unit and counted with its full
+ // length.
+ Advance = encoding::getEscapeSequenceLength(Text);
+ Chars += Advance;
+ } else {
+ // Any other code point counts as one, however many bytes it occupies.
+ Advance = encoding::getCodePointNumBytes(Text[0], Encoding);
+ Chars += 1;
+ }
+
+ // Stop as soon as the current element would exceed the budget.
+ if (Chars > MaxSplit)
+ break;
+
+ if (Text[0] == ' ')
+ SpaceOffset = SplitPoint;
+ if (Text[0] == '/')
+ SlashOffset = SplitPoint;
+
+ SplitPoint += Advance;
+ Text = Text.substr(Advance);
+ }
+
+ // Prefer breaking right after the last space, then right after the last
+ // slash, and otherwise at the last element boundary that was reached.
+ if (SpaceOffset != 0)
return BreakableToken::Split(SpaceOffset + 1, 0);
- StringRef::size_type SlashOffset = Text.rfind('/', MaxSplit);
- if (SlashOffset != StringRef::npos && SlashOffset != 0)
+ if (SlashOffset != 0)
return BreakableToken::Split(SlashOffset + 1, 0);
- StringRef::size_type SplitPoint = getStartOfCharacter(Text, MaxSplit);
- if (SplitPoint == StringRef::npos || SplitPoint == 0)
- return BreakableToken::Split(StringRef::npos, 0);
- return BreakableToken::Split(SplitPoint, 0);
+ if (SplitPoint != 0)
+ return BreakableToken::Split(SplitPoint, 0);
+ return BreakableToken::Split(StringRef::npos, 0);
}
} // namespace
+// Returns StartColumn plus the sizes of Prefix and Postfix plus the number of
+// code points in the part of Line that starts at byte offset \p TailOffset.
+// \p LineIndex is not used here.
unsigned
BreakableSingleLineToken::getLineLengthAfterSplit(unsigned LineIndex,
unsigned TailOffset) const {
- return StartColumn + Prefix.size() + Postfix.size() + Line.size() -
- TailOffset;
+ // The tail is measured in code points rather than bytes; Prefix and Postfix
+ // are still measured with size().
+ return StartColumn + Prefix.size() + Postfix.size() +
+ encoding::getCodePointCount(Line.substr(TailOffset), Encoding);
}
void BreakableSingleLineToken::insertBreak(unsigned LineIndex,
BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok,
unsigned StartColumn,
StringRef Prefix,
- StringRef Postfix)
- : BreakableToken(Tok), StartColumn(StartColumn), Prefix(Prefix),
+ StringRef Postfix,
+ encoding::Encoding Encoding)
+ : BreakableToken(Tok, Encoding), StartColumn(StartColumn), Prefix(Prefix),
Postfix(Postfix) {
assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
Line = Tok.TokenText.substr(
}
+// Sets up a breakable string literal: both the prefix and the postfix passed
+// to the base class are a single double quote, and \p Encoding is forwarded
+// unchanged.
BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok,
- unsigned StartColumn)
- : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"") {}
+ unsigned StartColumn,
+ encoding::Encoding Encoding)
+ : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"", Encoding) {}
+// Delegates to getStringSplit() for the part of Line that starts at byte
+// offset \p TailOffset. \p LineIndex is not used here.
BreakableToken::Split
BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const {
- return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit);
+ // NOTE(review): the "+ 2" presumably accounts for the quote characters around
+ // the content - confirm against the callers.
+ return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit,
+ Encoding);
}
static StringRef getLineCommentPrefix(StringRef Comment) {
}
+// Sets up a breakable line comment: the prefix is whatever
+// getLineCommentPrefix() returns for the token text, the postfix is empty, and
+// \p Encoding is forwarded unchanged to the base class.
BreakableLineComment::BreakableLineComment(const FormatToken &Token,
- unsigned StartColumn)
+ unsigned StartColumn,
+ encoding::Encoding Encoding)
: BreakableSingleLineToken(Token, StartColumn,
- getLineCommentPrefix(Token.TokenText), "") {}
+ getLineCommentPrefix(Token.TokenText), "",
+ Encoding) {}
+// Delegates to getCommentSplit() for the part of Line that starts at byte
+// offset \p TailOffset; the content column is StartColumn plus the size of
+// the comment prefix. \p LineIndex is not used here.
BreakableToken::Split
BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const {
return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(),
- ColumnLimit);
+ ColumnLimit, Encoding);
}
-BreakableBlockComment::BreakableBlockComment(const FormatStyle &Style,
- const FormatToken &Token,
- unsigned StartColumn,
- unsigned OriginalStartColumn,
- bool FirstInLine)
- : BreakableToken(Token) {
+BreakableBlockComment::BreakableBlockComment(
+ const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn,
+ unsigned OriginalStartColumn, bool FirstInLine, encoding::Encoding Encoding)
+ : BreakableToken(Token, Encoding) {
StringRef TokenText(Token.TokenText);
assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
BreakableBlockComment::getLineLengthAfterSplit(unsigned LineIndex,
unsigned TailOffset) const {
return getContentStartColumn(LineIndex, TailOffset) +
- (Lines[LineIndex].size() - TailOffset) +
+ encoding::getCodePointCount(Lines[LineIndex].substr(TailOffset),
+ Encoding) +
// The last line gets a "*/" postfix.
(LineIndex + 1 == Lines.size() ? 2 : 0);
}
unsigned ColumnLimit) const {
return getCommentSplit(Lines[LineIndex].substr(TailOffset),
getContentStartColumn(LineIndex, TailOffset),
- ColumnLimit);
+ ColumnLimit, Encoding);
}
void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
#ifndef LLVM_CLANG_FORMAT_BREAKABLETOKEN_H
#define LLVM_CLANG_FORMAT_BREAKABLETOKEN_H
+#include "Encoding.h"
#include "TokenAnnotator.h"
#include "WhitespaceManager.h"
#include <utility>
WhitespaceManager &Whitespaces) {}
protected:
- BreakableToken(const FormatToken &Tok) : Tok(Tok) {}
+ BreakableToken(const FormatToken &Tok, encoding::Encoding Encoding)
+ : Tok(Tok), Encoding(Encoding) {}
const FormatToken &Tok;
+ encoding::Encoding Encoding;
};
/// \brief Base class for single line tokens that can be broken.
protected:
BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn,
- StringRef Prefix, StringRef Postfix);
+ StringRef Prefix, StringRef Postfix,
+ encoding::Encoding Encoding);
// The column in which the token starts.
unsigned StartColumn;
///
/// \p StartColumn specifies the column in which the token will start
/// after formatting.
- BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn);
+ BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
+ encoding::Encoding Encoding);
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const;
///
/// \p StartColumn specifies the column in which the comment will start
/// after formatting.
- BreakableLineComment(const FormatToken &Token, unsigned StartColumn);
+ BreakableLineComment(const FormatToken &Token, unsigned StartColumn,
+ encoding::Encoding Encoding);
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const;
/// If the comment starts a line after formatting, set \p FirstInLine to true.
BreakableBlockComment(const FormatStyle &Style, const FormatToken &Token,
unsigned StartColumn, unsigned OriginaStartColumn,
- bool FirstInLine);
+ bool FirstInLine, encoding::Encoding Encoding);
virtual unsigned getLineCount() const;
virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
--- /dev/null
+//===--- Encoding.h - Format C++ code -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
+/// 8-bit encodings and escape sequences in C++ string literals.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_FORMAT_ENCODING_H
+#define LLVM_CLANG_FORMAT_ENCODING_H
+
+#include "clang/Basic/LLVM.h"
+#include "llvm/Support/ConvertUTF.h"
+
+namespace clang {
+namespace format {
+namespace encoding {
+
+/// \brief The text encodings the helpers in this file tell apart.
+enum Encoding {
+ Encoding_UTF8,
+ Encoding_Unknown // We treat all other encodings as 8-bit encodings.
+};
+
+/// \brief Classifies the encoding of \p Text.
+///
+/// Yields \c Encoding_UTF8 when \c isLegalUTF8String accepts the whole buffer,
+/// and \c Encoding_Unknown (handled as some 8-bit encoding) otherwise.
+inline Encoding detectEncoding(StringRef Text) {
+  const UTF8 *Cursor = reinterpret_cast<const UTF8 *>(Text.begin());
+  const UTF8 *const Limit = reinterpret_cast<const UTF8 *>(Text.end());
+  const bool IsUTF8 = ::isLegalUTF8String(&Cursor, Limit);
+  return IsUTF8 ? Encoding_UTF8 : Encoding_Unknown;
+}
+
+/// \brief Counts the code points in \p Text, treating it as UTF-8.
+///
+/// Steps from one lead byte to the next, using \c getNumBytesForUTF8 to find
+/// the width of each sequence, and counts the steps.
+inline unsigned getCodePointCountUTF8(StringRef Text) {
+  unsigned Count = 0;
+  size_t Pos = 0;
+  const size_t End = Text.size();
+  while (Pos < End) {
+    Pos += getNumBytesForUTF8(Text[Pos]);
+    ++Count;
+  }
+  return Count;
+}
+
+/// \brief Returns how many code points \p Text contains when read in the
+/// given \p Encoding.
+///
+/// UTF-8 text is counted sequence by sequence; for every other encoding the
+/// byte count of \p Text is returned.
+inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
+  if (Encoding == Encoding_UTF8)
+    return getCodePointCountUTF8(Text);
+  // Anything that is not UTF-8 is handled as one byte per code point.
+  return Text.size();
+}
+
+/// \brief Returns the byte length of the code point sequence whose first byte
+/// is \p FirstChar, for the given \p Encoding.
+///
+/// For UTF-8 the length comes from \c getNumBytesForUTF8; for every other
+/// encoding it is 1.
+inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
+  if (Encoding == Encoding_UTF8)
+    return getNumBytesForUTF8(FirstChar);
+  return 1;
+}
+
+/// \brief Returns true if \p c is one of the characters '0' through '7'.
+inline bool isOctDigit(char c) {
+  return c >= '0' && c <= '7';
+}
+
+/// \brief Returns true if \p c is a decimal digit or a letter in the ranges
+/// 'a'-'f' or 'A'-'F'.
+inline bool isHexDigit(char c) {
+  if (c >= '0' && c <= '9')
+    return true;
+  if (c >= 'a' && c <= 'f')
+    return true;
+  return c >= 'A' && c <= 'F';
+}
+
+/// \brief Gets the length in bytes of an escape sequence inside a C++ string
+/// literal.
+///
+/// \p Text must be non-empty and start at the backslash that introduces the
+/// escape sequence; it should extend to the end of the string literal. The
+/// returned length includes the backslash. For such input it is never larger
+/// than \c Text.size(): a sequence that is cut short by the end of \p Text
+/// (for example a "\u" followed by fewer than four characters) yields the
+/// number of bytes that are actually available.
+inline unsigned getEscapeSequenceLength(StringRef Text) {
+  // Check for emptiness first so the assertion itself never indexes past the
+  // end of Text.
+  assert(!Text.empty() && Text[0] == '\\');
+  const unsigned Size = Text.size();
+  if (Size < 2)
+    return 1; // A lone backslash.
+
+  unsigned Length;
+  switch (Text[1]) {
+  case 'u':
+    Length = 6; // Backslash, 'u' and four more characters.
+    break;
+  case 'U':
+    Length = 10; // Backslash, 'U' and eight more characters.
+    break;
+  case 'x':
+    Length = 2; // Point after '\x'.
+    while (Length < Size && isHexDigit(Text[Length]))
+      ++Length;
+    break;
+  default:
+    if (isOctDigit(Text[1])) {
+      // At most three octal digits follow the backslash.
+      Length = 1;
+      while (Length < Size && Length < 4 && isOctDigit(Text[Length]))
+        ++Length;
+    } else {
+      Length = 2; // Backslash plus one more byte.
+    }
+    break;
+  }
+  // The fixed-width forms above may run past the end of Text; never report
+  // more bytes than there are.
+  return Length < Size ? Length : Size;
+}
+
+} // namespace encoding
+} // namespace format
+} // namespace clang
+
+#endif // LLVM_CLANG_FORMAT_ENCODING_H
UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr,
const AnnotatedLine &Line, unsigned FirstIndent,
const FormatToken *RootToken,
- WhitespaceManager &Whitespaces)
+ WhitespaceManager &Whitespaces,
+ encoding::Encoding Encoding)
: Style(Style), SourceMgr(SourceMgr), Line(Line),
FirstIndent(FirstIndent), RootToken(RootToken),
- Whitespaces(Whitespaces), Count(0) {}
+ Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {}
/// \brief Formats an \c UnwrappedLine.
void format(const AnnotatedLine *NextLine) {
State.NextToken->WhitespaceRange.getEnd()) -
SourceMgr.getSpellingColumnNumber(
State.NextToken->WhitespaceRange.getBegin());
- State.Column += WhitespaceLength + State.NextToken->TokenLength;
+ State.Column += WhitespaceLength + State.NextToken->CodePointCount;
State.NextToken = State.NextToken->Next;
return 0;
}
Line.StartsDefinition)) {
State.Column = State.Stack.back().Indent;
} else if (Current.Type == TT_ObjCSelectorName) {
- if (State.Stack.back().ColonPos > Current.TokenLength) {
- State.Column = State.Stack.back().ColonPos - Current.TokenLength;
+ if (State.Stack.back().ColonPos > Current.CodePointCount) {
+ State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
} else {
State.Column = State.Stack.back().Indent;
- State.Stack.back().ColonPos = State.Column + Current.TokenLength;
+ State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
}
} else if (Current.Type == TT_StartOfName ||
Previous.isOneOf(tok::coloncolon, tok::equal) ||
State.Stack.back().LastSpace = State.Column;
if (Current.isOneOf(tok::arrow, tok::period) &&
Current.Type != TT_DesignatedInitializerPeriod)
- State.Stack.back().LastSpace += Current.TokenLength;
+ State.Stack.back().LastSpace += Current.CodePointCount;
State.StartOfLineLevel = State.ParenLevel;
State.LowestCallLevel = State.ParenLevel;
State.Stack.back().VariablePos = State.Column;
// Move over * and & if they are bound to the variable name.
const FormatToken *Tok = &Previous;
- while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) {
- State.Stack.back().VariablePos -= Tok->TokenLength;
+ while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
+ State.Stack.back().VariablePos -= Tok->CodePointCount;
if (Tok->SpacesRequiredBefore != 0)
break;
Tok = Tok->Previous;
if (Current.Type == TT_ObjCSelectorName &&
State.Stack.back().ColonPos == 0) {
if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
- State.Column + Spaces + Current.TokenLength)
+ State.Column + Spaces + Current.CodePointCount)
State.Stack.back().ColonPos =
State.Stack.back().Indent + Current.LongestObjCSelectorName;
else
State.Stack.back().ColonPos =
- State.Column + Spaces + Current.TokenLength;
+ State.Column + Spaces + Current.CodePointCount;
}
if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel);
if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0)
State.Stack.back().StartOfFunctionCall =
- Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength;
+ Current.LastInChainOfCalls ? 0
+ : State.Column + Current.CodePointCount;
}
if (Current.Type == TT_CtorInitializerColon) {
// Indent 2 from the column, so:
State.StartOfStringLiteral = 0;
}
- State.Column += Current.TokenLength;
+ State.Column += Current.CodePointCount;
State.NextToken = State.NextToken->Next;
bool DryRun) {
unsigned UnbreakableTailLength = Current.UnbreakableTailLength;
llvm::OwningPtr<BreakableToken> Token;
- unsigned StartColumn = State.Column - Current.TokenLength;
+ unsigned StartColumn = State.Column - Current.CodePointCount;
unsigned OriginalStartColumn =
SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
1;
if (!LiteralData || *LiteralData != '"')
return 0;
- Token.reset(new BreakableStringLiteral(Current, StartColumn));
+ Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding));
} else if (Current.Type == TT_BlockComment) {
BreakableBlockComment *BBC = new BreakableBlockComment(
- Style, Current, StartColumn, OriginalStartColumn, !Current.Previous);
+ Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
+ Encoding);
Token.reset(BBC);
} else if (Current.Type == TT_LineComment &&
(Current.Previous == NULL ||
Current.Previous->Type != TT_ImplicitStringLiteral)) {
- Token.reset(new BreakableLineComment(Current, StartColumn));
+ Token.reset(new BreakableLineComment(Current, StartColumn, Encoding));
} else {
return 0;
}
Whitespaces);
}
unsigned TailOffset = 0;
- unsigned RemainingTokenLength =
+ unsigned RemainingTokenColumns =
Token->getLineLengthAfterSplit(LineIndex, TailOffset);
- while (RemainingTokenLength > RemainingSpace) {
+ while (RemainingTokenColumns > RemainingSpace) {
BreakableToken::Split Split =
Token->getSplit(LineIndex, TailOffset, getColumnLimit());
if (Split.first == StringRef::npos)
break;
assert(Split.first != 0);
- unsigned NewRemainingTokenLength = Token->getLineLengthAfterSplit(
+ unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit(
LineIndex, TailOffset + Split.first + Split.second);
- assert(NewRemainingTokenLength < RemainingTokenLength);
+ assert(NewRemainingTokenColumns < RemainingTokenColumns);
if (!DryRun) {
Token->insertBreak(LineIndex, TailOffset, Split, Line.InPPDirective,
Whitespaces);
}
TailOffset += Split.first + Split.second;
- RemainingTokenLength = NewRemainingTokenLength;
+ RemainingTokenColumns = NewRemainingTokenColumns;
Penalty += Style.PenaltyExcessCharacter;
BreakInserted = true;
}
- PositionAfterLastLineInToken = RemainingTokenLength;
+ PositionAfterLastLineInToken = RemainingTokenColumns;
}
if (BreakInserted) {
// Increasing count of \c StateNode items we have created. This is used
// to create a deterministic order independent of the container.
unsigned Count;
+ encoding::Encoding Encoding;
};
class FormatTokenLexer {
public:
- FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr)
+ FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr,
+ encoding::Encoding Encoding)
: FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex),
- SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) {
+ SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()),
+ Encoding(Encoding) {
Lex.SetKeepWhitespaceMode(true);
}
FormatTok->Tok.getLocation().getLocWithOffset(1);
FormatTok->WhitespaceRange =
SourceRange(GreaterLocation, GreaterLocation);
- FormatTok->TokenLength = 1;
+ FormatTok->ByteCount = 1;
+ FormatTok->CodePointCount = 1;
GreaterStashed = false;
return FormatTok;
}
}
// Now FormatTok is the next non-whitespace token.
- FormatTok->TokenLength = Text.size();
+ FormatTok->ByteCount = Text.size();
TrailingWhitespace = 0;
if (FormatTok->Tok.is(tok::comment)) {
TrailingWhitespace = Text.size() - Text.rtrim().size();
- FormatTok->TokenLength -= TrailingWhitespace;
+ FormatTok->ByteCount -= TrailingWhitespace;
}
// In case the token starts with escaped newlines, we want to
while (i + 1 < Text.size() && Text[i] == '\\' && Text[i + 1] == '\n') {
// FIXME: ++FormatTok->NewlinesBefore is missing...
WhitespaceLength += 2;
- FormatTok->TokenLength -= 2;
+ FormatTok->ByteCount -= 2;
i += 2;
}
if (FormatTok->Tok.is(tok::greatergreater)) {
FormatTok->Tok.setKind(tok::greater);
- FormatTok->TokenLength = 1;
+ FormatTok->ByteCount = 1;
GreaterStashed = true;
}
+ unsigned EncodingExtraBytes =
+ Text.size() - encoding::getCodePointCount(Text, Encoding);
+ FormatTok->CodePointCount = FormatTok->ByteCount - EncodingExtraBytes;
+
FormatTok->WhitespaceRange = SourceRange(
WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
FormatTok->TokenText = StringRef(
SourceMgr.getCharacterData(FormatTok->getStartOfNonWhitespace()),
- FormatTok->TokenLength);
+ FormatTok->ByteCount);
return FormatTok;
}
Lexer &Lex;
SourceManager &SourceMgr;
IdentifierTable IdentTable;
+ encoding::Encoding Encoding;
llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
SmallVector<FormatToken *, 16> Tokens;
Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr,
const std::vector<CharSourceRange> &Ranges)
: Style(Style), Lex(Lex), SourceMgr(SourceMgr),
- Whitespaces(SourceMgr, Style), Ranges(Ranges) {}
+ Whitespaces(SourceMgr, Style), Ranges(Ranges),
+ Encoding(encoding::detectEncoding(Lex.getBuffer())) {
+ DEBUG(llvm::dbgs()
+ << "File encoding: "
+ << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
+ << "\n");
+ }
virtual ~Formatter() {}
tooling::Replacements format() {
- FormatTokenLexer Tokens(Lex, SourceMgr);
+ FormatTokenLexer Tokens(Lex, SourceMgr, Encoding);
UnwrappedLineParser Parser(Style, Tokens.lex(), *this);
bool StructuralError = Parser.parse();
- TokenAnnotator Annotator(Style, SourceMgr, Lex,
- Tokens.getIdentTable().get("in"));
+ TokenAnnotator Annotator(Style, Tokens.getIdentTable().get("in"));
for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
Annotator.annotate(AnnotatedLines[i]);
}
1;
}
UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent,
- TheLine.First, Whitespaces);
+ TheLine.First, Whitespaces, Encoding);
Formatter.format(I + 1 != E ? &*(I + 1) : NULL);
IndentForLevel[TheLine.Level] = LevelIndent;
PreviousLineWasTouched = true;
CharSourceRange LineRange = CharSourceRange::getCharRange(
First->WhitespaceRange.getBegin().getLocWithOffset(
First->LastNewlineOffset),
- Last->Tok.getLocation().getLocWithOffset(Last->TokenLength - 1));
+ Last->Tok.getLocation().getLocWithOffset(Last->ByteCount - 1));
return touchesRanges(LineRange);
}
WhitespaceManager Whitespaces;
std::vector<CharSourceRange> Ranges;
std::vector<AnnotatedLine> AnnotatedLines;
+
+ encoding::Encoding Encoding;
};
tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex,
struct FormatToken {
FormatToken()
: NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
- TokenLength(0), IsFirst(false), MustBreakBefore(false),
- Type(TT_Unknown), SpacesRequiredBefore(0), CanBreakBefore(false),
- ClosesTemplateDeclaration(false), ParameterCount(0), TotalLength(0),
- UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0),
- LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false),
+ ByteCount(0), CodePointCount(0), IsFirst(false),
+ MustBreakBefore(false), Type(TT_Unknown), SpacesRequiredBefore(0),
+ CanBreakBefore(false), ClosesTemplateDeclaration(false),
+ ParameterCount(0), TotalLength(0), UnbreakableTailLength(0),
+ BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0),
+ FakeRParens(0), LastInChainOfCalls(false),
PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL),
Next(NULL) {}
/// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'.
unsigned LastNewlineOffset;
- /// \brief The length of the non-whitespace parts of the token. This is
- /// necessary because we need to handle escaped newlines that are stored
+ /// \brief The number of bytes of the non-whitespace parts of the token. This
+ /// is necessary because we need to handle escaped newlines that are stored
/// with the token.
- unsigned TokenLength;
+ unsigned ByteCount;
+
+ /// \brief The length of the non-whitespace parts of the token in CodePoints.
+ /// We need this to correctly measure number of columns a token spans.
+ unsigned CodePointCount;
/// \brief Indicates that this is the first token.
bool IsFirst;
#include "TokenAnnotator.h"
#include "clang/Basic/SourceManager.h"
-#include "clang/Lex/Lexer.h"
#include "llvm/Support/Debug.h"
namespace clang {
/// into template parameter lists.
class AnnotatingParser {
public:
- AnnotatingParser(SourceManager &SourceMgr, Lexer &Lex, AnnotatedLine &Line,
- IdentifierInfo &Ident_in)
- : SourceMgr(SourceMgr), Lex(Lex), Line(Line), CurrentToken(Line.First),
- KeywordVirtualFound(false), NameFound(false), Ident_in(Ident_in) {
+ AnnotatingParser(AnnotatedLine &Line, IdentifierInfo &Ident_in)
+ : Line(Line), CurrentToken(Line.First), KeywordVirtualFound(false),
+ NameFound(false), Ident_in(Ident_in) {
Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/ false));
}
Line.First->Type == TT_ObjCMethodSpecifier) {
Tok->Type = TT_ObjCMethodExpr;
Tok->Previous->Type = TT_ObjCSelectorName;
- if (Tok->Previous->TokenLength >
- Contexts.back().LongestObjCSelectorName)
- Contexts.back().LongestObjCSelectorName = Tok->Previous->TokenLength;
+ if (Tok->Previous->CodePointCount >
+ Contexts.back().LongestObjCSelectorName) {
+ Contexts.back().LongestObjCSelectorName =
+ Tok->Previous->CodePointCount;
+ }
if (Contexts.back().FirstObjCSelectorName == NULL)
Contexts.back().FirstObjCSelectorName = Tok->Previous;
} else if (Contexts.back().ColonIsForRangeExpr) {
} else if (Current.isBinaryOperator()) {
Current.Type = TT_BinaryOperator;
} else if (Current.is(tok::comment)) {
- std::string Data(
- Lexer::getSpelling(Current.Tok, SourceMgr, Lex.getLangOpts()));
- if (StringRef(Data).startswith("//"))
+ if (Current.TokenText.startswith("//"))
Current.Type = TT_LineComment;
else
Current.Type = TT_BlockComment;
case tok::kw_wchar_t:
case tok::kw_bool:
case tok::kw___underlying_type:
- return true;
case tok::annot_typename:
case tok::kw_char16_t:
case tok::kw_char32_t:
case tok::kw_typeof:
case tok::kw_decltype:
- return Lex.getLangOpts().CPlusPlus;
+ return true;
default:
- break;
+ return false;
}
- return false;
}
SmallVector<Context, 8> Contexts;
- SourceManager &SourceMgr;
- Lexer &Lex;
AnnotatedLine &Line;
FormatToken *CurrentToken;
bool KeywordVirtualFound;
};
void TokenAnnotator::annotate(AnnotatedLine &Line) {
- AnnotatingParser Parser(SourceMgr, Lex, Line, Ident_in);
+ AnnotatingParser Parser(Line, Ident_in);
Line.Type = Parser.parseLine();
if (Line.Type == LT_Invalid)
return;
}
void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
- Line.First->TotalLength = Line.First->TokenLength;
+ Line.First->TotalLength = Line.First->CodePointCount;
if (!Line.First->Next)
return;
FormatToken *Current = Line.First->Next;
Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit;
else
Current->TotalLength =
- Current->Previous->TotalLength + Current->TokenLength +
+ Current->Previous->TotalLength + Current->CodePointCount +
Current->SpacesRequiredBefore;
// FIXME: Only calculate this if CanBreakBefore is true once static
// initializers etc. are sorted out.
UnbreakableTailLength = 0;
} else {
UnbreakableTailLength +=
- Current->TokenLength + Current->SpacesRequiredBefore;
+ Current->CodePointCount + Current->SpacesRequiredBefore;
}
Current = Current->Previous;
}
if (Right.is(tok::lessless)) {
if (Left.is(tok::string_literal)) {
- StringRef Content =
- StringRef(Left.Tok.getLiteralData(), Left.TokenLength);
+ StringRef Content = Left.TokenText;
Content = Content.drop_back(1).drop_front(1).trim();
if (Content.size() > 1 &&
(Content.back() == ':' || Content.back() == '='))
#include <string>
namespace clang {
-class Lexer;
class SourceManager;
namespace format {
/// \c UnwrappedLine.
class TokenAnnotator {
public:
- TokenAnnotator(const FormatStyle &Style, SourceManager &SourceMgr, Lexer &Lex,
- IdentifierInfo &Ident_in)
- : Style(Style), SourceMgr(SourceMgr), Lex(Lex), Ident_in(Ident_in) {
- }
+ TokenAnnotator(const FormatStyle &Style, IdentifierInfo &Ident_in)
+ : Style(Style), Ident_in(Ident_in) {}
void annotate(AnnotatedLine &Line);
void calculateFormattingInformation(AnnotatedLine &Line);
void calculateUnbreakableTailLengths(AnnotatedLine &Line);
const FormatStyle &Style;
- SourceManager &SourceMgr;
- Lexer &Lex;
// Contextual keywords:
IdentifierInfo &Ident_in;
EXPECT_EQ(Style, ParsedStyle);
}
+// A string literal made of bytes above 0x7F (written as \x escapes, so the
+// formatter sees the raw bytes) is expected to be broken after spaces into
+// four literals when the column limit is 12.
+TEST_F(FormatTest, WorksFor8bitEncodings) {
+ EXPECT_EQ("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 \"\n"
+ "\"\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \"\n"
+ "\"\xe7\xe8\xec\xed\xfe\xfe \"\n"
+ "\"\xef\xee\xf0\xf3...\"",
+ format("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 "
+ "\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \xe7\xe8\xec\xed\xfe\xfe "
+ "\xef\xee\xf0\xf3...\"",
+ getLLVMStyleWithColumns(12)));
+}
+
+// Runs verifyFormat() on string literals, line comments and block comments
+// containing Cyrillic and CJK text, each with its own column limit.
+// NOTE(review): the limits presumably equal the code point count of each line,
+// so the text only fits if code points rather than bytes are counted - confirm
+// against verifyFormat(), which is not shown here.
+TEST_F(FormatTest, CountsUTF8CharactersProperly) {
+ verifyFormat("\"Однажды в студёную зимнюю пору...\"",
+ getLLVMStyleWithColumns(35));
+ verifyFormat("\"一 二 三 四 五 六 七 八 九 十\"",
+ getLLVMStyleWithColumns(21));
+ verifyFormat("// Однажды в студёную зимнюю пору...",
+ getLLVMStyleWithColumns(36));
+ verifyFormat("// 一 二 三 四 五 六 七 八 九 十",
+ getLLVMStyleWithColumns(22));
+ verifyFormat("/* Однажды в студёную зимнюю пору... */",
+ getLLVMStyleWithColumns(39));
+ verifyFormat("/* 一 二 三 四 五 六 七 八 九 十 */",
+ getLLVMStyleWithColumns(25));
+}
+
+// String literals with Cyrillic and CJK content are expected to be broken
+// after spaces into several adjacent literals under column limits of 13 and
+// 10.
+TEST_F(FormatTest, SplitsUTF8Strings) {
+ EXPECT_EQ(
+ "\"Однажды, в \"\n"
+ "\"студёную \"\n"
+ "\"зимнюю \"\n"
+ "\"пору,\"",
+ format("\"Однажды, в студёную зимнюю пору,\"",
+ getLLVMStyleWithColumns(13)));
+ EXPECT_EQ("\"一 二 三 四 \"\n"
+ "\"五 六 七 八 \"\n"
+ "\"九 十\"",
+ format("\"一 二 三 四 五 六 七 八 九 十\"",
+ getLLVMStyleWithColumns(10)));
+}
+
+// Line comments with Cyrillic and CJK content are expected to be re-wrapped
+// into several "// " lines under column limits of 13 and 6.
+TEST_F(FormatTest, SplitsUTF8LineComments) {
+ EXPECT_EQ("// Я из лесу\n"
+ "// вышел; был\n"
+ "// сильный\n"
+ "// мороз.",
+ format("// Я из лесу вышел; был сильный мороз.",
+ getLLVMStyleWithColumns(13)));
+ EXPECT_EQ("// 一二三\n"
+ "// 四五六七\n"
+ "// 八\n"
+ "// 九 十",
+ format("// 一二三 四五六七 八 九 十", getLLVMStyleWithColumns(6)));
+}
+
+// Block comments with Cyrillic and CJK content are expected to be re-wrapped
+// into " * "-prefixed continuation lines under column limits of 13 and 6.
+TEST_F(FormatTest, SplitsUTF8BlockComments) {
+ EXPECT_EQ("/* Гляжу,\n"
+ " * поднимается\n"
+ " * медленно в\n"
+ " * гору\n"
+ " * Лошадка,\n"
+ " * везущая\n"
+ " * хворосту\n"
+ " * воз. */",
+ format("/* Гляжу, поднимается медленно в гору\n"
+ " * Лошадка, везущая хворосту воз. */",
+ getLLVMStyleWithColumns(13)));
+ EXPECT_EQ("/* 一二三\n"
+ " * 四五六七\n"
+ " * 八\n"
+ " * 九 十\n"
+ " */",
+ format("/* 一二三 四五六七 八 九 十 */", getLLVMStyleWithColumns(6)));
+}
+
} // end namespace tooling
} // end namespace clang