From: Alexander Kornienko
Date: Wed, 13 Nov 2013 14:04:17 +0000 (+0000)
Subject: Correctly mark first token in the presence of UTF-8 BOM.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a9f280942e3129a3d9e051bcf1bb5616243f212c;p=clang

Correctly mark first token in the presence of UTF-8 BOM.

Summary: Fixes http://llvm.org/PR17753

Reviewers: klimek

Reviewed By: klimek

CC: cfe-commits, klimek

Differential Revision: http://llvm-reviews.chandlerc.com/D2159

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@194576 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index a5fc83e84e..64ffe9e8f9 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -996,7 +996,7 @@ class FormatTokenLexer {
 public:
   FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr, FormatStyle &Style,
                    encoding::Encoding Encoding)
-      : FormatTok(NULL), GreaterStashed(false), Column(0),
+      : FormatTok(NULL), IsFirstToken(true), GreaterStashed(false), Column(0),
         TrailingWhitespace(0), Lex(Lex), SourceMgr(SourceMgr), Style(Style),
         IdentTable(getFormattingLangOpts()), Encoding(Encoding) {
     Lex.SetKeepWhitespaceMode(true);
@@ -1069,8 +1069,8 @@ private:
     readRawToken(*FormatTok);
     SourceLocation WhitespaceStart =
         FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
-    if (SourceMgr.getFileOffset(WhitespaceStart) == 0)
-      FormatTok->IsFirst = true;
+    FormatTok->IsFirst = IsFirstToken;
+    IsFirstToken = false;
 
     // Consume and record whitespace until we find a significant token.
     unsigned WhitespaceLength = TrailingWhitespace;
@@ -1181,6 +1181,7 @@ private:
   }
 
   FormatToken *FormatTok;
+  bool IsFirstToken;
   bool GreaterStashed;
   unsigned Column;
   unsigned TrailingWhitespace;
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp
index f2ac3ba94f..b6574c7503 100644
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -6911,6 +6911,14 @@ TEST_F(FormatTest, WorksFor8bitEncodings) {
                    getLLVMStyleWithColumns(12)));
 }
 
+TEST_F(FormatTest, HandlesUTF8BOM) {
+  EXPECT_EQ("\xef\xbb\xbf", format("\xef\xbb\xbf"));
+  EXPECT_EQ("\xef\xbb\xbf#include <iostream>",
+            format("\xef\xbb\xbf#include <iostream>"));
+  EXPECT_EQ("\xef\xbb\xbf\n#include <iostream>",
+            format("\xef\xbb\xbf\n#include <iostream>"));
+}
+
 // FIXME: Encode Cyrillic and CJK characters below to appease MS compilers.
 #if !defined(_MSC_VER)