From: Richard Smith Date: Wed, 13 Jun 2012 05:37:23 +0000 (+0000) Subject: PR13099: Teach -Wformat about raw string literals, UTF-8 strings and Unicode escape... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=df9ef1bc8c3780307ab2ed81bf5e31c23310b936;p=clang PR13099: Teach -Wformat about raw string literals, UTF-8 strings and Unicode escape sequences. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@158390 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp index 9d7a93a429..d3f6a521f1 100644 --- a/lib/AST/Expr.cpp +++ b/lib/AST/Expr.cpp @@ -679,7 +679,8 @@ void StringLiteral::setString(ASTContext &C, StringRef Str, SourceLocation StringLiteral:: getLocationOfByte(unsigned ByteNo, const SourceManager &SM, const LangOptions &Features, const TargetInfo &Target) const { - assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings"); + assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) && + "Only narrow string literals are currently supported"); // Loop over all of the tokens in this string until we find the one that // contains the byte we're looking for. diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index c7120f2bef..2930d6a5ff 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -250,6 +250,39 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, return true; } +/// MeasureUCNEscape - Determine the number of bytes within the resulting string +/// which this UCN will occupy. +static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, + const char *ThisTokEnd, unsigned CharByteWidth, + const LangOptions &Features, bool &HadError) { + // UTF-32: 4 bytes per escape. + if (CharByteWidth == 4) + return 4; + + uint32_t UcnVal = 0; + unsigned short UcnLen = 0; + FullSourceLoc Loc; + + if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, + UcnLen, Loc, 0, Features, true)) { + HadError = true; + return 0; + } + + // UTF-16: 2 bytes for BMP, 4 bytes otherwise. + if (CharByteWidth == 2) + return UcnVal <= 0xFFFF ? 2 : 4; + + // UTF-8. + if (UcnVal < 0x80) + return 1; + if (UcnVal < 0x800) + return 2; + if (UcnVal < 0x10000) + return 3; + return 4; +} + /// EncodeUCNEscape - Read the Universal Character Name, check constraints and /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of /// StringLiteralParser. When we decide to implement UCN's for identifiers, @@ -265,7 +298,7 @@ static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, unsigned short UcnLen = 0; if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags, Features, true)) { - HadError = 1; + HadError = true; return; } @@ -1369,14 +1402,31 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, if (StringInvalid) return 0; + const char *SpellingStart = SpellingPtr; + const char *SpellingEnd = SpellingPtr+TokLen; + + // Handle UTF-8 strings just like narrow strings. + if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8') + SpellingPtr += 2; + assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' && SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet"); + // For raw string literals, this is easy. + if (SpellingPtr[0] == 'R') { + assert(SpellingPtr[1] == '"' && "Should be a raw string literal!"); + // Skip 'R"'. + SpellingPtr += 2; + while (*SpellingPtr != '(') { + ++SpellingPtr; + assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal"); + } + // Skip '('. + ++SpellingPtr; + return SpellingPtr - SpellingStart + ByteNo; + } - const char *SpellingStart = SpellingPtr; - const char *SpellingEnd = SpellingPtr+TokLen; - - // Skip over the leading quote. + // Skip over the leading quote assert(SpellingPtr[0] == '"' && "Should be a string literal!"); ++SpellingPtr; @@ -1393,11 +1443,23 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, // Otherwise, this is an escape character. Advance over it. bool HadError = false; - ProcessCharEscape(SpellingPtr, SpellingEnd, HadError, - FullSourceLoc(Tok.getLocation(), SM), - CharByteWidth*8, Diags); + if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') { + const char *EscapePtr = SpellingPtr; + unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd, + 1, Features, HadError); + if (Len > ByteNo) { + // ByteNo is somewhere within the escape sequence. + SpellingPtr = EscapePtr; + break; + } + ByteNo -= Len; + } else { + ProcessCharEscape(SpellingPtr, SpellingEnd, HadError, + FullSourceLoc(Tok.getLocation(), SM), + CharByteWidth*8, Diags); + --ByteNo; + } assert(!HadError && "This method isn't valid on erroneous strings"); - --ByteNo; } return SpellingPtr-SpellingStart; diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index e818f5f3e0..06035921d9 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -2633,7 +2633,7 @@ void Sema::CheckFormatString(const StringLiteral *FExpr, bool inFunctionCall) { // CHECK: is the format string a wide literal? - if (!FExpr->isAscii()) { + if (!FExpr->isAscii() && !FExpr->isUTF8()) { CheckFormatHandler::EmitFormatDiagnostic( *this, inFunctionCall, Args[format_idx], PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(), diff --git a/test/SemaCXX/format-strings-0x.cpp b/test/SemaCXX/format-strings-0x.cpp index e7c5904c66..7b3aef1ee5 100644 --- a/test/SemaCXX/format-strings-0x.cpp +++ b/test/SemaCXX/format-strings-0x.cpp @@ -12,4 +12,16 @@ void f(char **sp, float *fp) { scanf("%afoobar", fp); printf(nullptr); printf(*sp); // expected-warning {{not a string literal}} + + // PR13099 + printf( + R"foobar(%)foobar" + R"bazquux(d)bazquux" // expected-warning {{more '%' conversions than data arguments}} + R"xyzzy()xyzzy"); + + printf(u8"this is %d test", 0); // ok + printf(u8R"foo( + \u1234\U0010fffe + %d)foo" // expected-warning {{more '%' conversions than data arguments}} + ); }