From 8f2f047ed53ada7a4ebc35249c668912b1fb1968 Mon Sep 17 00:00:00 2001 From: Alexander Kornienko Date: Tue, 26 Nov 2013 10:38:53 +0000 Subject: [PATCH] Fix crash in getStringSplit. Summary: getStringSplit used to crash, when trying to split a long string literal containing both printable and unprintable multi-byte UTF-8 characters. Reviewers: djasper, klimek Reviewed By: djasper CC: cfe-commits, klimek Differential Revision: http://llvm-reviews.chandlerc.com/D2268 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@195728 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Format/BreakableToken.cpp | 6 ++---- lib/Format/Encoding.h | 8 +++++--- unittests/Format/FormatTest.cpp | 12 ++++++++++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lib/Format/BreakableToken.cpp b/lib/Format/BreakableToken.cpp index d720ce990b..a08102a3b7 100644 --- a/lib/Format/BreakableToken.cpp +++ b/lib/Format/BreakableToken.cpp @@ -92,9 +92,7 @@ static BreakableToken::Split getStringSplit(StringRef Text, return BreakableToken::Split(StringRef::npos, 0); if (ColumnLimit <= UsedColumns) return BreakableToken::Split(StringRef::npos, 0); - unsigned MaxSplit = std::min( - ColumnLimit - UsedColumns, - encoding::columnWidthWithTabs(Text, UsedColumns, TabWidth, Encoding) - 1); + unsigned MaxSplit = ColumnLimit - UsedColumns; StringRef::size_type SpaceOffset = 0; StringRef::size_type SlashOffset = 0; StringRef::size_type WordStartOffset = 0; @@ -110,7 +108,7 @@ static BreakableToken::Split getStringSplit(StringRef Text, Text.substr(0, Advance), UsedColumns + Chars, TabWidth, Encoding); } - if (Chars > MaxSplit) + if (Chars > MaxSplit || Text.size() == Advance) break; if (IsBlank(Text[0])) diff --git a/lib/Format/Encoding.h b/lib/Format/Encoding.h index 356334d537..dba5174b97 100644 --- a/lib/Format/Encoding.h +++ b/lib/Format/Encoding.h @@ -64,6 +64,10 @@ inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { inline unsigned columnWidth(StringRef Text, Encoding 
Encoding) { if (Encoding == Encoding_UTF8) { int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text); + // FIXME: Figure out the correct way to handle this in the presence of both + // printable and unprintable multi-byte UTF-8 characters. Falling back to + // returning the number of bytes may cause problems, as columnWidth suddenly + // becomes non-additive. if (ContentWidth >= 0) return ContentWidth; } @@ -81,9 +85,7 @@ inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, StringRef::size_type TabPos = Tail.find('\t'); if (TabPos == StringRef::npos) return TotalWidth + columnWidth(Tail, Encoding); - int Width = columnWidth(Tail.substr(0, TabPos), Encoding); - assert(Width >= 0); - TotalWidth += Width; + TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding); TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth; Tail = Tail.substr(TabPos + 1); } diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp index fc0e935037..a7dce3b86f 100644 --- a/unittests/Format/FormatTest.cpp +++ b/unittests/Format/FormatTest.cpp @@ -6991,6 +6991,16 @@ TEST_F(FormatTest, CountsUTF8CharactersProperly) { } TEST_F(FormatTest, SplitsUTF8Strings) { + // Non-printable characters' width is currently considered to be the length in + // bytes in UTF8. The characters can be displayed in very different manner + // (zero-width, single width with a substitution glyph, expanded to their code + // (e.g. "<8d>"), so there's no single correct way to handle them. 
+ EXPECT_EQ("\"aaaaÄ\"\n" + "\"\xc2\x8d\";", + format("\"aaaaÄ\xc2\x8d\";", getLLVMStyleWithColumns(10))); + EXPECT_EQ("\"aaaaaaaÄ\"\n" + "\"\xc2\x8d\";", + format("\"aaaaaaaÄ\xc2\x8d\";", getLLVMStyleWithColumns(10))); EXPECT_EQ( "\"Однажды, в \"\n" "\"студёную \"\n" @@ -7024,6 +7034,8 @@ TEST_F(FormatTest, HandlesDoubleWidthCharsInMultiLineStrings) { } TEST_F(FormatTest, SplitsUTF8LineComments) { + EXPECT_EQ("// aaaaÄ\xc2\x8d", + format("// aaaaÄ\xc2\x8d", getLLVMStyleWithColumns(10))); EXPECT_EQ("// Я из лесу\n" "// вышел; был\n" "// сильный\n" -- 2.40.0