PR13099: Teach -Wformat about raw string literals, UTF-8 strings and Unicode escape sequences.

author Richard Smith <richard-llvm@metafoo.co.uk>

Wed, 13 Jun 2012 05:37:23 +0000 (05:37 +0000)

committer Richard Smith <richard-llvm@metafoo.co.uk>

Wed, 13 Jun 2012 05:37:23 +0000 (05:37 +0000)
author Richard Smith <richard-llvm@metafoo.co.uk>
Wed, 13 Jun 2012 05:37:23 +0000 (05:37 +0000)
committer Richard Smith <richard-llvm@metafoo.co.uk>
Wed, 13 Jun 2012 05:37:23 +0000 (05:37 +0000)
diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp

index 9d7a93a429e44887b76ad26067abe33bfaa354f7..d3f6a521f1c7d2314cf76298d8aa4812476cf374 100644 (file)
--- a/lib/AST/Expr.cpp
+++ b/lib/AST/Expr.cpp
@@ -679,7 +679,8 @@ void StringLiteral::setString(ASTContext &C, StringRef Str,
  SourceLocation StringLiteral::
  getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
                    const LangOptions &Features, const TargetInfo &Target) const {
-  assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings");
+  assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&
+         "Only narrow string literals are currently supported");
  
    // Loop over all of the tokens in this string until we find the one that
    // contains the byte we're looking for.
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp

index c7120f2befb4f99b4d0f41c6a78baf242f85e0a2..2930d6a5ff08f3005b21e5754b1b4d0593c863db 100644 (file)
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -250,6 +250,39 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    return true;
  }
  
+/// MeasureUCNEscape - Determine the number of bytes within the resulting string
+/// which this UCN will occupy. Returns 0 and sets HadError if ProcessUCNEscape fails.
+static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
+                            const char *ThisTokEnd, unsigned CharByteWidth,
+                            const LangOptions &Features, bool &HadError) {
+  // UTF-32: 4 bytes per escape. NOTE(review): returns before ProcessUCNEscape is called, so ThisTokBuf is untouched on this path - confirm callers are fine with that.
+  if (CharByteWidth == 4)
+    return 4;
+
+  uint32_t UcnVal = 0;
+  unsigned short UcnLen = 0;
+  FullSourceLoc Loc;
+
+  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
+                        UcnLen, Loc, 0, Features, true)) {
+    HadError = true;
+    return 0;
+  }
+
+  // UTF-16: 2 bytes for BMP (UcnVal <= 0xFFFF), 4 bytes otherwise.
+  if (CharByteWidth == 2)
+    return UcnVal <= 0xFFFF ? 2 : 4;
+
+  // UTF-8: 1 to 4 bytes, selected by the thresholds 0x80, 0x800 and 0x10000.
+  if (UcnVal < 0x80)
+    return 1;
+  if (UcnVal < 0x800)
+    return 2;
+  if (UcnVal < 0x10000)
+    return 3;
+  return 4;
+}
+
  /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
  /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
  /// StringLiteralParser. When we decide to implement UCN's for identifiers,
@@ -265,7 +298,7 @@ static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    unsigned short UcnLen = 0;
    if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
                          Loc, Diags, Features, true)) {
-    HadError = 1;
+    HadError = true;
      return;
    }
  
@@ -1369,14 +1402,31 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
    if (StringInvalid)
      return 0;
  
+  const char *SpellingStart = SpellingPtr;
+  const char *SpellingEnd = SpellingPtr+TokLen;
+
+  // Handle UTF-8 strings just like narrow strings.
+  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
+    SpellingPtr += 2;
+
    assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
           SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
  
+  // For raw string literals, this is easy.
+  if (SpellingPtr[0] == 'R') {
+    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
+    // Skip 'R"'.
+    SpellingPtr += 2;
+    while (*SpellingPtr != '(') {
+      ++SpellingPtr;
+      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
+    }
+    // Skip '('.
+    ++SpellingPtr;
+    return SpellingPtr - SpellingStart + ByteNo;
+  }
  
-  const char *SpellingStart = SpellingPtr;
-  const char *SpellingEnd = SpellingPtr+TokLen;
-
-  // Skip over the leading quote.
+  // Skip over the leading quote
    assert(SpellingPtr[0] == '"' && "Should be a string literal!");
    ++SpellingPtr;
  
@@ -1393,11 +1443,23 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
  
      // Otherwise, this is an escape character.  Advance over it.
      bool HadError = false;
-    ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
-                      FullSourceLoc(Tok.getLocation(), SM),
-                      CharByteWidth*8, Diags);
+    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
+      const char *EscapePtr = SpellingPtr;
+      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
+                                      1, Features, HadError);
+      if (Len > ByteNo) {
+        // ByteNo is somewhere within the escape sequence.
+        SpellingPtr = EscapePtr;
+        break;
+      }
+      ByteNo -= Len;
+    } else {
+      ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
+                        FullSourceLoc(Tok.getLocation(), SM),
+                        CharByteWidth*8, Diags);
+      --ByteNo;
+    }
      assert(!HadError && "This method isn't valid on erroneous strings");
-    --ByteNo;
    }
  
    return SpellingPtr-SpellingStart;
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp

index e818f5f3e07271731848e6417f311c4b29844277..06035921d988945ad65b59013d07e64c2895e506 100644 (file)
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -2633,7 +2633,7 @@ void Sema::CheckFormatString(const StringLiteral *FExpr,
                               bool inFunctionCall) {
    
    // CHECK: is the format string a wide literal?
-  if (!FExpr->isAscii()) {
+  if (!FExpr->isAscii() && !FExpr->isUTF8()) {
      CheckFormatHandler::EmitFormatDiagnostic(
        *this, inFunctionCall, Args[format_idx],
        PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(),
diff --git a/test/SemaCXX/format-strings-0x.cpp b/test/SemaCXX/format-strings-0x.cpp

index e7c5904c66e371e75f4085b17c563e29dfce4eeb..7b3aef1ee5da65dfd7a1856a84e7d09807d53807 100644 (file)
--- a/test/SemaCXX/format-strings-0x.cpp
+++ b/test/SemaCXX/format-strings-0x.cpp
@@ -12,4 +12,16 @@ void f(char **sp, float *fp) {
    scanf("%afoobar", fp);
    printf(nullptr);
    printf(*sp); // expected-warning {{not a string literal}}
+
+  // PR13099
+  printf(
+    R"foobar(%)foobar"
+    R"bazquux(d)bazquux" // expected-warning {{more '%' conversions than data arguments}}
+    R"xyzzy()xyzzy");
+
+  printf(u8"this is %d test", 0); // ok
+  printf(u8R"foo(
+      \u1234\U0010fffe
+      %d)foo" // expected-warning {{more '%' conversions than data arguments}}
+  );
  }
author	Richard Smith <richard-llvm@metafoo.co.uk>
	Wed, 13 Jun 2012 05:37:23 +0000 (05:37 +0000)
committer	Richard Smith <richard-llvm@metafoo.co.uk>
	Wed, 13 Jun 2012 05:37:23 +0000 (05:37 +0000)
lib/AST/Expr.cpp		patch \| blob \| history
lib/Lex/LiteralSupport.cpp		patch \| blob \| history
lib/Sema/SemaChecking.cpp		patch \| blob \| history
test/SemaCXX/format-strings-0x.cpp		patch \| blob \| history