From: Ted Kremenek Date: Tue, 14 Aug 2007 17:39:48 +0000 (+0000) Subject: Added support for additional format string checking for the printf X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=71895b9aa3ad71957359497e136b50fcb6136bdf;p=clang Added support for additional format string checking for the printf family of functions. Previous functionality only included checking to see if the format string was a string literal. Now we check parse the format string (if it is a literal) and perform the following checks: (1) Warn if: number conversions (e.g. "%d") != number data arguments. (2) Warn about missing format strings (e.g., "printf()"). (3) Warn if the format string is not a string literal. (4) Warn about the use se of '%n' conversion. This conversion is discouraged for security reasons. (5) Warn about malformed conversions. For example '%;', '%v'; these are not valid. (6) Warn about empty format strings; e.g. printf(""). Although these can be optimized away by the compiler, they can be indicative of broken programmer logic. We may need to add additional support to see when such cases occur within macro expansion to avoid false positives. (7) Warn if the string literal is wide; e.g. L"%d". (8) Warn if we detect a '\0' character WITHIN the format string. Test cases are included. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@41076 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/Sema/Sema.h b/Sema/Sema.h index d373cc0400..22fffc7299 100644 --- a/Sema/Sema.h +++ b/Sema/Sema.h @@ -423,10 +423,14 @@ private: // Extra semantic analysis beyond the C type system private: - void CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl, + void CheckFunctionCall(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + FunctionDecl *FDecl, Expr** Args, unsigned NumArgsInCall); - void CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl, + void CheckPrintfArguments(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + bool HasVAListArg, FunctionDecl *FDecl, unsigned format_idx, Expr** Args, unsigned NumArgsInCall); }; diff --git a/Sema/SemaChecking.cpp b/Sema/SemaChecking.cpp index 8cc3c6c147..511f56f2a8 100644 --- a/Sema/SemaChecking.cpp +++ b/Sema/SemaChecking.cpp @@ -29,7 +29,9 @@ using namespace clang; /// CheckFunctionCall - Check a direct function call for various correctness /// and safety properties not strictly enforced by the C type system. void -Sema::CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl, +Sema::CheckFunctionCall(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + FunctionDecl *FDecl, Expr** Args, unsigned NumArgsInCall) { // Get the IdentifierInfo* for the called function. @@ -37,55 +39,287 @@ Sema::CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl, // Search the KnownFunctionIDs for the identifier. unsigned i = 0, e = id_num_known_functions; - for ( ; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; } - if( i == e ) return; + for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; } + if (i == e) return; // Printf checking. if (i <= id_vprintf) { - // Retrieve the index of the format string parameter. + // Retrieve the index of the format string parameter and determine + // if the function is passed a va_arg argument. unsigned format_idx = 0; + bool HasVAListArg = false; + switch (i) { default: assert(false && "No format string argument index."); case id_printf: format_idx = 0; break; case id_fprintf: format_idx = 1; break; case id_sprintf: format_idx = 1; break; case id_snprintf: format_idx = 2; break; - case id_vsnprintf: format_idx = 2; break; - case id_asprintf: format_idx = 1; break; - case id_vasprintf: format_idx = 1; break; - case id_vfprintf: format_idx = 1; break; - case id_vsprintf: format_idx = 1; break; - case id_vprintf: format_idx = 1; break; - } - CheckPrintfArguments(Fn, i, FDecl, format_idx, Args, NumArgsInCall); + case id_asprintf: format_idx = 1; HasVAListArg = true; break; + case id_vsnprintf: format_idx = 2; HasVAListArg = true; break; + case id_vasprintf: format_idx = 1; HasVAListArg = true; break; + case id_vfprintf: format_idx = 1; HasVAListArg = true; break; + case id_vsprintf: format_idx = 1; HasVAListArg = true; break; + case id_vprintf: format_idx = 0; HasVAListArg = true; break; + } + + CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg, + FDecl, format_idx, Args, NumArgsInCall); } } /// CheckPrintfArguments - Check calls to printf (and similar functions) for -/// correct use of format strings. Improper format strings to functions in -/// the printf family can be the source of bizarre bugs and very serious -/// security holes. A good source of information is available in the following -/// paper (which includes additional references): +/// correct use of format strings. +/// +/// HasVAListArg - A predicate indicating whether the printf-like +/// function is passed an explicit va_arg argument (e.g., vprintf) +/// +/// format_idx - The index into Args for the format string. +/// +/// Improper format strings to functions in the printf family can be +/// the source of bizarre bugs and very serious security holes. A +/// good source of information is available in the following paper +/// (which includes additional references): /// /// FormatGuard: Automatic Protection From printf Format String /// Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001. +/// +/// Functionality implemented: +/// +/// We can statically check the following properties for string +/// literal format strings for non v.*printf functions (where the +/// arguments are passed directly): +// +/// (1) Are the number of format conversions equal to the number of +/// data arguments? +/// +/// (2) Does each format conversion correctly match the type of the +/// corresponding data argument? (TODO) +/// +/// Moreover, for all printf functions we can: +/// +/// (3) Check for a missing format string (when not caught by type checking). +/// +/// (4) Check for no-operation flags; e.g. using "#" with format +/// conversion 'c' (TODO) +/// +/// (5) Check the use of '%n', a major source of security holes. +/// +/// (6) Check for malformed format conversions that don't specify anything. +/// +/// (7) Check for empty format strings. e.g: printf(""); +/// +/// (8) Check that the format string is a wide literal. +/// +/// All of these checks can be done by parsing the format string. +/// +/// For now, we ONLY do (1), (3), (5), (6), (7), and (8). void -Sema::CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl, +Sema::CheckPrintfArguments(Expr *Fn, + SourceLocation LParenLoc, SourceLocation RParenLoc, + bool HasVAListArg, FunctionDecl *FDecl, unsigned format_idx, Expr** Args, unsigned NumArgsInCall) { - - assert( format_idx < NumArgsInCall ); - + // CHECK: printf-like function is called with no format string. + if (format_idx >= NumArgsInCall) { + Diag(RParenLoc, diag::warn_printf_missing_format_string, + Fn->getSourceRange()); + return; + } + // CHECK: format string is not a string literal. // - // Dynamically generated format strings are difficult to automatically - // vet at compile time. Requiring that format strings are string literals - // (1) permits the checking of format strings by the compiler and thereby - // (2) can practically remove the source of many format string exploits. - + // Dynamically generated format strings are difficult to + // automatically vet at compile time. Requiring that format strings + // are string literals: (1) permits the checking of format strings by + // the compiler and thereby (2) can practically remove the source of + // many format string exploits. StringLiteral *FExpr = dyn_cast(Args[format_idx]); - if ( FExpr == NULL ) - Diag( Args[format_idx]->getLocStart(), - diag::warn_printf_not_string_constant, Fn->getSourceRange() ); -} \ No newline at end of file + if (FExpr == NULL) { + Diag(Args[format_idx]->getLocStart(), + diag::warn_printf_not_string_constant, Fn->getSourceRange()); + return; + } + + // CHECK: is the format string a wide literal? + if (FExpr->isWide()) { + Diag(Args[format_idx]->getLocStart(), + diag::warn_printf_format_string_is_wide_literal, + Fn->getSourceRange()); + return; + } + + // Str - The format string. NOTE: this is NOT null-terminated! + const char * const Str = FExpr->getStrData(); + + // CHECK: empty format string? + const unsigned StrLen = FExpr->getByteLength(); + + if (StrLen == 0) { + Diag(Args[format_idx]->getLocStart(), + diag::warn_printf_empty_format_string, Fn->getSourceRange()); + return; + } + + // We process the format string using a binary state machine. The + // current state is stored in CurrentState. + enum { + state_OrdChr, + state_Conversion + } CurrentState = state_OrdChr; + + // numConversions - The number of conversions seen so far. This is + // incremented as we traverse the format string. + unsigned numConversions = 0; + + // numDataArgs - The number of data arguments after the format + // string. This can only be determined for non vprintf-like + // functions. For those functions, this value is 1 (the sole + // va_arg argument). + unsigned numDataArgs = NumArgsInCall-(format_idx+1); + + // Inspect the format string. + unsigned StrIdx = 0; + + // LastConversionIdx - Index within the format string where we last saw + // a '%' character that starts a new format conversion. + unsigned LastConversionIdx = 0; + + for ( ; StrIdx < StrLen ; ++StrIdx ) { + + // Is the number of detected conversion conversions greater than + // the number of matching data arguments? If so, stop. + if (!HasVAListArg && numConversions > numDataArgs) break; + + // Handle "\0" + if(Str[StrIdx] == '\0' ) { + // The string returned by getStrData() is not null-terminated, + // so the presence of a null character is likely an error. + + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1); + + Diag(Loc, diag::warn_printf_format_string_contains_null_char, + Fn->getSourceRange()); + + return; + } + + // Ordinary characters (not processing a format conversion). + if (CurrentState == state_OrdChr) { + if (Str[StrIdx] == '%') { + CurrentState = state_Conversion; + LastConversionIdx = StrIdx; + } + continue; + } + + // Seen '%'. Now processing a format conversion. + switch (Str[StrIdx]) { + // Characters which can terminate a format conversion + // (e.g. "%d"). Characters that specify length modifiers or + // other flags are handled by the default case below. + // + // TODO: additional checks will go into the following cases. + case 'i': + case 'd': + case 'o': + case 'u': + case 'x': + case 'X': + case 'D': + case 'O': + case 'U': + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + case 'a': + case 'A': + case 'c': + case 'C': + case 'S': + case 's': + case 'P': + ++numConversions; + CurrentState = state_OrdChr; + break; + + // CHECK: Are we using "%n"? Issue a warning. + case 'n': { + ++numConversions; + CurrentState = state_OrdChr; + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx+1); + + Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange()); + break; + } + + // Handle "%%" + case '%': + // Sanity check: Was the first "%" character the previous one? + // If not, we will assume that we have a malformed format + // conversion, and that the current "%" character is the start + // of a new conversion. + if (StrIdx - LastConversionIdx == 1) + CurrentState = state_OrdChr; + else { + // Issue a warning: invalid format conversion. + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx+1); + + Diag(Loc, diag::warn_printf_invalid_conversion, + std::string(Str+LastConversionIdx, Str+StrIdx), + Fn->getSourceRange()); + + // This conversion is broken. Advance to the next format + // conversion. + LastConversionIdx = StrIdx; + ++numConversions; + } + + break; + + default: + // This case catches all other characters: flags, widths, etc. + // We should eventually process those as well. + break; + } + } + + if (CurrentState == state_Conversion) { + // Issue a warning: invalid format conversion. + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx+1); + + Diag(Loc, diag::warn_printf_invalid_conversion, + std::string(Str+LastConversionIdx, Str+StrIdx), + Fn->getSourceRange()); + return; + } + + if (!HasVAListArg) { + // CHECK: Does the number of format conversions exceed the number + // of data arguments? + if (numConversions > numDataArgs) { + SourceLocation Loc = + PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(), + LastConversionIdx); + + Diag(Loc, diag::warn_printf_insufficient_data_args, + Fn->getSourceRange()); + } + // CHECK: Does the number of data arguments exceed the number of + // format conversions in the format string? + else if (numConversions < numDataArgs) + Diag(Args[format_idx+numConversions+1]->getLocStart(), + diag::warn_printf_too_many_data_args, Fn->getSourceRange()); + } +} diff --git a/Sema/SemaExpr.cpp b/Sema/SemaExpr.cpp index 6b871b4812..f69867fdf1 100644 --- a/Sema/SemaExpr.cpp +++ b/Sema/SemaExpr.cpp @@ -561,7 +561,7 @@ ParseCallExpr(ExprTy *fn, SourceLocation LParenLoc, if (ImplicitCastExpr *IcExpr = dyn_cast(Fn)) if (DeclRefExpr *DRExpr = dyn_cast(IcExpr->getSubExpr())) if (FunctionDecl *FDecl = dyn_cast(DRExpr->getDecl())) - CheckFunctionCall(Fn, FDecl, Args, NumArgsInCall); + CheckFunctionCall(Fn, LParenLoc, RParenLoc, FDecl, Args, NumArgsInCall); return new CallExpr(Fn, Args, NumArgsInCall, resultType, RParenLoc); } diff --git a/include/clang/Basic/DiagnosticKinds.def b/include/clang/Basic/DiagnosticKinds.def index 041f5537dd..9137a8419f 100644 --- a/include/clang/Basic/DiagnosticKinds.def +++ b/include/clang/Basic/DiagnosticKinds.def @@ -661,8 +661,24 @@ DIAG(warn_unused_expr, WARNING, // Printf checking DIAG(warn_printf_not_string_constant, WARNING, - "format string is not a string literal") - + "format string is not a string literal (potentially insecure)") +DIAG(warn_printf_write_back, WARNING, + "use of '%n' in format string discouraged (potentially insecure)") +DIAG(warn_printf_insufficient_data_args, WARNING, + "more '%' conversions than data arguments") +DIAG(warn_printf_too_many_data_args, WARNING, + "more data arguments than '%' conversions") +DIAG(warn_printf_invalid_conversion, WARNING, + "invalid conversion '%0'") +DIAG(warn_printf_missing_format_string, WARNING, + "format string missing") +DIAG(warn_printf_empty_format_string, WARNING, + "format string is empty") +DIAG(warn_printf_format_string_is_wide_literal, WARNING, + "format string should not be a wide string") +DIAG(warn_printf_format_string_contains_null_char, WARNING, + "format string contains '\\0' within the string body") + // Statements. DIAG(err_continue_not_in_loop, ERROR, "'continue' statement not in loop statement") diff --git a/test/Sema/format-strings.c b/test/Sema/format-strings.c index f71cd58645..403da07b0f 100644 --- a/test/Sema/format-strings.c +++ b/test/Sema/format-strings.c @@ -21,3 +21,44 @@ void check_string_literal( FILE* fp, const char* s, char *buf, ... ) { vsnprintf(buf,2,s,ap); // expected-warning {{mat string is not a string lit}} } +void check_writeback_specifier() +{ + int x; + char *b; + + printf("%n",&x); // expected-warning {{'%n' in format string discouraged}} + sprintf(b,"%d%%%n",1, &x); // expected-warning {{'%n' in format string dis}} +} + +void check_invalid_specifier(FILE* fp, char *buf) +{ + printf("%s%lb%d","unix",10,20); // expected-warning {{lid conversion '%lb'}} + fprintf(fp,"%%%l"); // expected-warning {{lid conversion '%l'}} + sprintf(buf,"%%%%%ld%d%d", 1, 2, 3); // no-warning + snprintf(buf, 2, "%%%%%ld%;%d", 1, 2, 3); // expected-warning {{sion '%;'}} +} + +void check_null_char_string(char* b) +{ + printf("\0this is bogus%d",1); // expected-warning {{string contains '\0'}} + snprintf(b,10,"%%%%%d\0%d",1,2); // expected-warning {{string contains '\0'}} + printf("%\0d",1); // expected-warning {{string contains '\0'}} +} + +void check_empty_format_string(char* buf) +{ + va_list ap; + va_start(ap,buf); + vprintf("",ap); // expected-warning {{format string is empty}} + sprintf(buf,""); // expected-warning {{format string is empty}} +} + +void check_wide_string() +{ + char *b; + va_list ap; + va_start(ap,b); + + printf(L"foo %d",2); // expected-warning {{should not be a wide string}} + vasprintf(&b,L"bar %d",2); // expected-warning {{should not be a wide string}} +}