[Sema] Handle UTF-8 invalid format string specifiers

author Bruno Cardoso Lopes <bruno.cardoso@gmail.com>

Tue, 29 Mar 2016 17:35:02 +0000 (17:35 +0000)

committer Bruno Cardoso Lopes <bruno.cardoso@gmail.com>

Tue, 29 Mar 2016 17:35:02 +0000 (17:35 +0000)
author Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Tue, 29 Mar 2016 17:35:02 +0000 (17:35 +0000)
committer Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Tue, 29 Mar 2016 17:35:02 +0000 (17:35 +0000)
diff --git a/include/clang/Analysis/Analyses/FormatString.h b/include/clang/Analysis/Analyses/FormatString.h

index 4471311a3390a1ef29096d600fb899eb12b27197..a593e9853e6d65acef51fbe71e1ef3946f290de8 100644 (file)
--- a/include/clang/Analysis/Analyses/FormatString.h
+++ b/include/clang/Analysis/Analyses/FormatString.h
@@ -210,6 +210,7 @@ public:
    unsigned getLength() const {
      return EndScanList ? EndScanList - Position : 1;
    }
+  void setEndScanList(const char *pos) { EndScanList = pos; }
  
    bool isIntArg() const { return (kind >= IntArgBeg && kind <= IntArgEnd) ||
      kind == FreeBSDrArg || kind == FreeBSDyArg; }
@@ -413,11 +414,6 @@ public:
    bool isObjCArg() const { return kind >= ObjCBeg && kind <= ObjCEnd; }
    bool isDoubleArg() const { return kind >= DoubleArgBeg &&
                                      kind <= DoubleArgEnd; }
-  unsigned getLength() const {
-      // Conversion specifiers currently only are represented by
-      // single characters, but we be flexible.
-    return 1;
-  }
  
    static bool classof(const analyze_format_string::ConversionSpecifier *CS) {
      return CS->isPrintfKind();
@@ -546,8 +542,6 @@ public:
    ScanfConversionSpecifier(const char *pos, Kind k)
      : ConversionSpecifier(false, pos, k) {}
  
-  void setEndScanList(const char *pos) { EndScanList = pos; }
-
    static bool classof(const analyze_format_string::ConversionSpecifier *CS) {
      return !CS->isPrintfKind();
    }
diff --git a/lib/Analysis/FormatString.cpp b/lib/Analysis/FormatString.cpp

index 1c42ec0e87c197c50f8f1341c39110b387bc4af0..badc71021a12f5ecaa11c61d07c5dd6d0ba3a78d 100644 (file)
--- a/lib/Analysis/FormatString.cpp
+++ b/lib/Analysis/FormatString.cpp
@@ -15,6 +15,7 @@
  #include "FormatStringParsing.h"
  #include "clang/Basic/LangOptions.h"
  #include "clang/Basic/TargetInfo.h"
+#include "llvm/Support/ConvertUTF.h"
  
  using clang::analyze_format_string::ArgType;
  using clang::analyze_format_string::FormatStringHandler;
@@ -260,6 +261,28 @@ clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
    return true;
  }
  
+bool clang::analyze_format_string::ParseUTF8InvalidSpecifier(
+    const char *SpecifierBegin, const char *FmtStrEnd, unsigned &Len) {
+  if (SpecifierBegin + 1 >= FmtStrEnd)
+    return false;
+
+  const UTF8 *SB = reinterpret_cast<const UTF8 *>(SpecifierBegin + 1);
+  const UTF8 *SE = reinterpret_cast<const UTF8 *>(FmtStrEnd);
+  const char FirstByte = *SB;
+
+  // If the invalid specifier is a multibyte UTF-8 string, report the
+  // total length through Len so that the conversion specifier can be
+  // properly updated to reflect a complete UTF-8 specifier.
+  unsigned NumBytes = getNumBytesForUTF8(FirstByte);
+  if (NumBytes == 1)
+    return false;
+  if (SB + NumBytes > SE)
+    return false;
+
+  Len = NumBytes + 1;
+  return true;
+}
+
  //===----------------------------------------------------------------------===//
  // Methods on ArgType.
  //===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/FormatStringParsing.h b/lib/Analysis/FormatStringParsing.h

index e1652964b8c24d1be4887bc0a96b8d166f71c530..8463fcec5bf49979414d231643296f35f87f89a7 100644 (file)
--- a/lib/Analysis/FormatStringParsing.h
+++ b/lib/Analysis/FormatStringParsing.h
@@ -46,7 +46,13 @@ bool ParseArgPosition(FormatStringHandler &H,
  /// FormatSpecifier& argument, and false otherwise.
  bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E,
                           const LangOptions &LO, bool IsScanf = false);
-  
+
+/// Returns true if the byte after \p SpecifierBegin begins a multibyte UTF-8
+/// sequence that does not extend past \p FmtStrEnd; on success, stores the
+/// sequence length plus one (for the byte at \p SpecifierBegin) in \p Len.
+bool ParseUTF8InvalidSpecifier(const char *SpecifierBegin,
+                               const char *FmtStrEnd, unsigned &Len);
+
  template <typename T> class SpecifierResult {
    T FS;
    const char *Start;
diff --git a/lib/Analysis/PrintfFormatString.cpp b/lib/Analysis/PrintfFormatString.cpp

index f0976bce9720906213d304f23fc3661afbe8391e..fb5df61c5ed12cbbf94bcc075fea6429cb0b592e 100644 (file)
--- a/lib/Analysis/PrintfFormatString.cpp
+++ b/lib/Analysis/PrintfFormatString.cpp
@@ -312,8 +312,13 @@ static PrintfSpecifierResult ParsePrintfSpecifier(FormatStringHandler &H,
      argIndex++;
  
    if (k == ConversionSpecifier::InvalidSpecifier) {
+    unsigned Len = I - Start;
+    if (ParseUTF8InvalidSpecifier(Start, E, Len)) {
+      CS.setEndScanList(Start + Len);
+      FS.setConversionSpecifier(CS);
+    }
      // Assume the conversion takes one argument.
-    return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, I - Start);
+    return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len);
    }
    return PrintfSpecifierResult(Start, FS);
  }
diff --git a/lib/Analysis/ScanfFormatString.cpp b/lib/Analysis/ScanfFormatString.cpp

index d484d8e828cbaa04d034ffc9dd8b7f8aa81dbd20..82b038864c23f087baa863509b23a518b7341f96 100644 (file)
--- a/lib/Analysis/ScanfFormatString.cpp
+++ b/lib/Analysis/ScanfFormatString.cpp
@@ -79,7 +79,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
                                                  unsigned &argIndex,
                                                  const LangOptions &LO,
                                                  const TargetInfo &Target) {
-  
+  using namespace clang::analyze_format_string;
    using namespace clang::analyze_scanf;
    const char *I = Beg;
    const char *Start = nullptr;
@@ -210,10 +210,15 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
    
    // FIXME: '%' and '*' doesn't make sense.  Issue a warning.
    // FIXME: 'ConsumedSoFar' and '*' doesn't make sense.
-  
+
    if (k == ScanfConversionSpecifier::InvalidSpecifier) {
+    unsigned Len = I - Beg;
+    if (ParseUTF8InvalidSpecifier(Beg, E, Len)) {
+      CS.setEndScanList(Beg + Len);
+      FS.setConversionSpecifier(CS);
+    }
      // Assume the conversion takes one argument.
-    return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, I - Beg);
+    return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len);
    }
    return ScanfSpecifierResult(Start, FS);
  }
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp

index cc261e0596369dddbc8d55956bc4f3f43b2ef21a..062041e3771946f921285bfd1a5c361dc6a6d565 100644 (file)
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -36,6 +36,8 @@
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallBitVector.h"
  #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Locale.h"
  #include "llvm/Support/ConvertUTF.h"
  #include "llvm/Support/raw_ostream.h"
  #include <limits>
@@ -3976,12 +3978,41 @@ CheckFormatHandler::HandleInvalidConversionSpecifier(unsigned argIndex,
      // gibberish when trying to match arguments.
      keepGoing = false;
    }
-  
-  EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_conversion)
-                         << StringRef(csStart, csLen),
-                       Loc, /*IsStringLocation*/true,
-                       getSpecifierRange(startSpec, specifierLen));
-  
+
+  StringRef Specifier(csStart, csLen);
+
+  // If the specifier is non-printable, it could be the first byte of a UTF-8
+  // sequence. In that case, print the UTF-8 code point. If not, print the byte
+  // hex value.
+  std::string CodePointStr;
+  if (!llvm::sys::locale::isPrint(*csStart)) {
+    UTF32 CodePoint;
+    const UTF8 **B = reinterpret_cast<const UTF8 **>(&csStart);
+    const UTF8 *E =
+        reinterpret_cast<const UTF8 *>(csStart + csLen);
+    ConversionResult Result =
+        llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion);
+
+    if (Result != conversionOK) {
+      unsigned char FirstChar = *csStart;
+      CodePoint = (UTF32)FirstChar;
+    }
+
+    llvm::raw_string_ostream OS(CodePointStr);
+    if (CodePoint < 256)
+      OS << "\\x" << llvm::format("%02x", CodePoint);
+    else if (CodePoint <= 0xFFFF)
+      OS << "\\u" << llvm::format("%04x", CodePoint);
+    else
+      OS << "\\U" << llvm::format("%08x", CodePoint);
+    OS.flush();
+    Specifier = CodePointStr;
+  }
+
+  EmitFormatDiagnostic(
+      S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc,
+      /*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen));
+
    return keepGoing;
  }
  
diff --git a/test/Sema/format-strings-scanf.c b/test/Sema/format-strings-scanf.c

index 7a92842b2454183496cd451f95a7f3cf6aff6452..ee2be0e6466f7ad491a4f39f25983ae612a12cac 100644 (file)
--- a/test/Sema/format-strings-scanf.c
+++ b/test/Sema/format-strings-scanf.c
@@ -183,3 +183,11 @@ void check_conditional_literal(char *s, int *i) {
    scanf(i ? "%d" : "%d", i, s); // expected-warning{{data argument not used}}
    scanf(i ? "%s" : "%d", s); // expected-warning{{format specifies type 'int *'}}
  }
+
+void testInvalidNoPrintable(int *a) {
+  scanf("%\u25B9", a); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  scanf("%\xE2\x96\xB9", a); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  scanf("%\U00010348", a); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  scanf("%\xF0\x90\x8D\x88", a); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  scanf("%\xe2", a); // expected-warning {{invalid conversion specifier '\xe2'}}
+}
diff --git a/test/Sema/format-strings.c b/test/Sema/format-strings.c

index 5559710c603554919452341a8ab50727cf9ec7a9..253aa57becddeb6e29498da612636391c9ea2879 100644 (file)
--- a/test/Sema/format-strings.c
+++ b/test/Sema/format-strings.c
@@ -642,6 +642,14 @@ void test_qualifiers(volatile int *vip, const int *cip,
    printf("%n", (cip_t)0); // expected-warning{{format specifies type 'int *' but the argument has type 'cip_t' (aka 'const int *')}}
  }
  
+void testInvalidNoPrintable() {
+  printf("%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  printf("%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  printf("%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  printf("%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  printf("%\xe2"); // expected-warning {{invalid conversion specifier '\xe2'}}
+}
+
  #pragma GCC diagnostic ignored "-Wformat-nonliteral"
  #pragma GCC diagnostic warning "-Wformat-security"
  // <rdar://problem/14178260>
diff --git a/test/SemaObjC/format-strings-objc.m b/test/SemaObjC/format-strings-objc.m

index a1ebf03f8ef91154259487fff65b62e6084f7ddc..2ac68cd0d203f57e234288054fe7c69cd3bb0671 100644 (file)
--- a/test/SemaObjC/format-strings-objc.m
+++ b/test/SemaObjC/format-strings-objc.m
@@ -265,3 +265,11 @@ void testObjCModifierFlags() {
    NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}}
  }
  
+// Test Objective-C invalid non-printable specifiers
+void testObjcInvalidNoPrintable(int *a) {
+  NSLog(@"%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  NSLog(@"%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  NSLog(@"%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  NSLog(@"%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  NSLog(@"%\xe2"); // expected-warning {{input conversion stopped}} expected-warning {{invalid conversion specifier '\xe2'}}
+}
author	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
	Tue, 29 Mar 2016 17:35:02 +0000 (17:35 +0000)
committer	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
	Tue, 29 Mar 2016 17:35:02 +0000 (17:35 +0000)
include/clang/Analysis/Analyses/FormatString.h		patch \| blob \| history
lib/Analysis/FormatString.cpp		patch \| blob \| history
lib/Analysis/FormatStringParsing.h		patch \| blob \| history
lib/Analysis/PrintfFormatString.cpp		patch \| blob \| history
lib/Analysis/ScanfFormatString.cpp		patch \| blob \| history
lib/Sema/SemaChecking.cpp		patch \| blob \| history
test/Sema/format-strings-scanf.c		patch \| blob \| history
test/Sema/format-strings.c		patch \| blob \| history
test/SemaObjC/format-strings-objc.m		patch \| blob \| history