From: Eli Friedman Date: Sat, 11 Feb 2012 05:08:10 +0000 (+0000) Subject: Implement warning for non-wide string literals with an unexpected encoding. Downgrad... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=91359302b822d829afa93c0dadf5f7ce6e19fbc6;p=clang Implement warning for non-wide string literals with an unexpected encoding. Downgrade error for non-wide character literals with an unexpected encoding to a warning for compatibility with gcc and older versions of clang. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@150295 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index 8de3fb3dc9..70f197731e 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -136,9 +136,16 @@ def err_unsupported_string_concat : Error< "unsupported non-standard concatenation of string literals">; def err_bad_string_encoding : Error< "illegal character encoding in string literal">; +def warn_bad_string_encoding : ExtWarn< + "illegal character encoding in string literal">, + InGroup>; def err_bad_character_encoding : Error< "illegal character encoding in character literal">; - +def warn_bad_character_encoding : ExtWarn< + "illegal character encoding in character literal">, + InGroup>; + + //===----------------------------------------------------------------------===// // PTH Diagnostics //===----------------------------------------------------------------------===// diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h index 1aabc63d54..6142f006c5 100644 --- a/include/clang/Lex/LiteralSupport.h +++ b/include/clang/Lex/LiteralSupport.h @@ -199,6 +199,7 @@ public: private: void init(const Token *StringToks, unsigned NumStringToks); bool CopyStringFragment(StringRef Fragment); + bool DiagnoseBadString(const Token& Tok); }; } // end namespace clang diff --git a/lib/Lex/LiteralSupport.cpp 
b/lib/Lex/LiteralSupport.cpp index a3f97d9ecc..547bd4e0c8 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -822,17 +822,32 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, ++begin; } while (begin != end && *begin != '\\'); - uint32_t *tmp_begin = buffer_begin; + char const *tmp_in_start = start; + uint32_t *tmp_out_start = buffer_begin; ConversionResult res = ConvertUTF8toUTF32(reinterpret_cast(&start), reinterpret_cast(begin), &buffer_begin,buffer_end,strictConversion); if (res!=conversionOK) { - PP.Diag(Loc, diag::err_bad_character_encoding); - HadError = true; + // If we see bad encoding for unprefixed character literals, warn and + // simply copy the byte values, for compatibility with gcc and + // older versions of clang. + bool NoErrorOnBadEncoding = isAscii(); + unsigned Msg = diag::err_bad_character_encoding; + if (NoErrorOnBadEncoding) + Msg = diag::warn_bad_character_encoding; + PP.Diag(Loc, Msg); + if (NoErrorOnBadEncoding) { + start = tmp_in_start; + buffer_begin = tmp_out_start; + for ( ; start != begin; ++start, ++buffer_begin) + *buffer_begin = static_cast(*start); + } else { + HadError = true; + } } else { - for (; tmp_begin largest_character_for_kind) { + for (; tmp_out_start largest_character_for_kind) { HadError = true; PP.Diag(Loc, diag::err_character_too_large); } @@ -1097,10 +1112,8 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ // Copy the string over if (CopyStringFragment(StringRef(ThisTokBuf,ThisTokEnd-ThisTokBuf))) { - if (Diags) - Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), - diag::err_bad_string_encoding); - hadError = true; + if (DiagnoseBadString(StringToks[i])) + hadError = true; } } else { @@ -1131,10 +1144,8 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ // Copy the character span over. 
if (CopyStringFragment(StringRef(InStart,ThisTokBuf-InStart))) { - if (Diags) - Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), - diag::err_bad_string_encoding); - hadError = true; + if (DiagnoseBadString(StringToks[i])) + hadError = true; } continue; } @@ -1219,6 +1230,9 @@ bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { ConversionResult result = conversionOK; // Copy the character span over. if (CharByteWidth == 1) { + if (!isLegalUTF8Sequence(reinterpret_cast(Fragment.begin()), + reinterpret_cast(Fragment.end()))) + result = sourceIllegal; memcpy(ResultPtr, Fragment.data(), Fragment.size()); ResultPtr += Fragment.size(); } else if (CharByteWidth == 2) { @@ -1226,7 +1240,7 @@ bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. UTF16 *targetStart = reinterpret_cast(ResultPtr); - ConversionFlags flags = lenientConversion; + ConversionFlags flags = strictConversion; result = ConvertUTF8toUTF16( &sourceStart,sourceStart + Fragment.size(), &targetStart,targetStart + 2*Fragment.size(),flags); @@ -1237,7 +1251,7 @@ bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. UTF32 *targetStart = reinterpret_cast(ResultPtr); - ConversionFlags flags = lenientConversion; + ConversionFlags flags = strictConversion; result = ConvertUTF8toUTF32( &sourceStart,sourceStart + Fragment.size(), &targetStart,targetStart + 4*Fragment.size(),flags); @@ -1249,6 +1263,17 @@ bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { return result != conversionOK; } +bool StringLiteralParser::DiagnoseBadString(const Token &Tok) { + // If we see bad encoding for unprefixed string literals, warn and + // simply copy the byte values, for compatibility with gcc and older + // versions of clang. 
+ bool NoErrorOnBadEncoding = isAscii(); + unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding : + diag::err_bad_string_encoding; + if (Diags) + Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg); + return !NoErrorOnBadEncoding; +} /// getOffsetOfStringByte - This function returns the offset of the /// specified byte of the string data represented by Token. This handles diff --git a/test/Lexer/char-literal-encoding-error.c b/test/Lexer/char-literal-encoding-error.c index 833ffcaa25..e752de2920 100644 --- a/test/Lexer/char-literal-encoding-error.c +++ b/test/Lexer/char-literal-encoding-error.c @@ -3,8 +3,13 @@ // This file is encoded using ISO-8859-1 int main() { - 'é'; // expected-error {{illegal character encoding in character literal}} - u'é'; // expected-error {{illegal character encoding in character literal}} - U'é'; // expected-error {{illegal character encoding in character literal}} - L'é'; // expected-error {{illegal character encoding in character literal}} + (void)'é'; // expected-warning {{illegal character encoding in character literal}} + (void)u'é'; // expected-error {{illegal character encoding in character literal}} + (void)U'é'; // expected-error {{illegal character encoding in character literal}} + (void)L'é'; // expected-error {{illegal character encoding in character literal}} + + // For narrow character literals, since there is no error, make sure the + // encoding is correct + static_assert((unsigned char)'é' == 0xE9, ""); // expected-warning {{illegal character encoding in character literal}} + static_assert('éé' == 0xE9E9, ""); // expected-warning {{illegal character encoding in character literal}} expected-warning {{multi-character character constant}} } diff --git a/test/Lexer/string-literal-encoding.c b/test/Lexer/string-literal-encoding.c index b12cfab156..aa7cb73f62 100644 --- a/test/Lexer/string-literal-encoding.c +++ b/test/Lexer/string-literal-encoding.c @@ -12,4 +12,7 @@ void f() { wchar_t const *d = 
LR"(Àéîõü)"; // expected-error {{illegal character encoding in string literal}} char16_t const *e = uR"(Àéîõü)"; // expected-error {{illegal character encoding in string literal}} char32_t const *f = UR"(Àéîõü)"; // expected-error {{illegal character encoding in string literal}} + + char const *g = "Àéîõü"; // expected-warning {{illegal character encoding in string literal}} + char const *h = u8"Àéîõü"; // expected-error {{illegal character encoding in string literal}} }