From 26b75c07317a3b50a8a00a1623e3ef38af1d8349 Mon Sep 17 00:00:00 2001 From: Richard Smith <richard-llvm@metafoo.co.uk> Date: Fri, 9 Mar 2012 22:27:51 +0000 Subject: [PATCH] Improve diagnostics for UCNs referring to control characters and members of the basic source character set in C++98. Add -Wc++98-compat diagnostics for same in literals in C++11. Extend such support to cover string literals as well as character literals, and mark N2170 as done. This seems too minor to warrant a release note to me. Let me know if you disagree. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@152444 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/DiagnosticLexKinds.td | 10 ++++ lib/Lex/LiteralSupport.cpp | 65 ++++++++++++++--------- test/CXX/lex/lex.charset/p2-cxx11.cpp | 42 +++++++++++++++ test/CXX/lex/lex.charset/p2-cxx98.cpp | 55 +++++++++++++++++++ test/Sema/ucn-cstring.c | 3 +- test/SemaCXX/cxx98-compat.cpp | 7 +++ www/cxx_status.html | 4 +- 7 files changed, 158 insertions(+), 28 deletions(-) create mode 100644 test/CXX/lex/lex.charset/p2-cxx11.cpp create mode 100644 test/CXX/lex/lex.charset/p2-cxx98.cpp diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index ba6385b9b9..9f23ff52d3 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -95,6 +95,16 @@ def err_hex_escape_no_digits : Error<"\\x used with no following hex digits">; def err_ucn_escape_no_digits : Error<"\\u used with no following hex digits">; def err_ucn_escape_invalid : Error<"invalid universal character">; def err_ucn_escape_incomplete : Error<"incomplete universal character name">; +def err_ucn_escape_basic_scs : Error< + "character '%0' cannot be specified by a universal character name">; +def err_ucn_control_character : Error< + "universal character name refers to a control character">; +def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning< + "specifying character '%0' with a universal character name " + "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore; +def warn_cxx98_compat_literal_ucn_control_character : Warning< + "universal character name referring to a control character " + "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore; def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">; def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">; def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">; diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index e0a5ba39d0..ae8157dabf 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -179,7 +179,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, /// ProcessUCNEscape - Read the Universal Character Name, check constraints and /// return the UTF32. -static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, +static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, + const char *ThisTokEnd, uint32_t &UcnVal, unsigned short &UcnLen, FullSourceLoc Loc, DiagnosticsEngine *Diags, const LangOptions &Features, @@ -187,8 +188,7 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, if (!Features.CPlusPlus && !Features.C99 && Diags) Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89); - // Save the beginning of the string (for error diagnostics). - const char *ThisTokBegin = ThisTokBuf; + const char *UcnBegin = ThisTokBuf; // Skip the '\u' char's. ThisTokBuf += 2; @@ -210,31 +210,43 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, if (UcnLenSave) { if (Diags) { SourceLocation L = - Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin, + Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin, Loc.getManager(), Features); - Diags->Report(FullSourceLoc(L, Loc.getManager()), - diag::err_ucn_escape_incomplete); + Diags->Report(L, diag::err_ucn_escape_incomplete); } return false; } + // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2] - bool invalid_ucn = (0xD800<=UcnVal && UcnVal<=0xDFFF) // surrogate codepoints - || 0x10FFFF < UcnVal; // maximum legal UTF32 value + if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints + UcnVal > 0x10FFFF) { // maximum legal UTF32 value + if (Diags) + Diags->Report(Loc, diag::err_ucn_escape_invalid); + return false; + } // C++11 allows UCNs that refer to control characters and basic source // characters inside character and string literals - if (!Features.CPlusPlus0x || !in_char_string_literal) { - if ((UcnVal < 0xa0 && - (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 ))) { // $, @, ` - invalid_ucn = true; + if (UcnVal < 0xa0 && + (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, ` + bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal); + if (Diags) { + SourceLocation UcnBeginLoc = + Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin, + Loc.getManager(), Features); + char BasicSCSChar = UcnVal; + if (UcnVal >= 0x20 && UcnVal < 0x7f) + Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs : + diag::warn_cxx98_compat_literal_ucn_escape_basic_scs) + << StringRef(&BasicSCSChar, 1); + else + Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character : + diag::warn_cxx98_compat_literal_ucn_control_character); } + if (IsError) + return false; } - if (invalid_ucn) { - if (Diags) - Diags->Report(Loc, diag::err_ucn_escape_invalid); - return false; - } return true; } @@ -242,7 +254,8 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of /// StringLiteralParser. When we decide to implement UCN's for identifiers, /// we will likely rework our support for UCN's. -static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, +static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, + const char *ThisTokEnd, char *&ResultBuf, bool &HadError, FullSourceLoc Loc, unsigned CharByteWidth, DiagnosticsEngine *Diags, @@ -250,8 +263,8 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, typedef uint32_t UTF32; UTF32 UcnVal = 0; unsigned short UcnLen = 0; - if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags, - Features)) { + if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, + Loc, Diags, Features, true)) { HadError = 1; return; } @@ -787,6 +800,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, Kind = kind; + const char *TokBegin = begin; + // Skip over wide character determinant. if (Kind != tok::char_constant) { ++begin; @@ -803,7 +818,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, --end; } while (end[-1] != '\''); UDSuffixBuf.assign(end, UDSuffixEnd); - UDSuffixOffset = end - begin + 1; + UDSuffixOffset = end - TokBegin; } // Trim the ending quote. @@ -885,7 +900,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // Is this a Universal Character Name excape? if (begin[1] == 'u' || begin[1] == 'U') { unsigned short UcnLen = 0; - if (!ProcessUCNEscape(begin, end, *buffer_begin, UcnLen, + if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, FullSourceLoc(Loc, PP.getSourceManager()), &PP.getDiagnostics(), PP.getLangOptions(), true)) @@ -1113,6 +1128,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ continue; } + const char *ThisTokBegin = ThisTokBuf; const char *ThisTokEnd = ThisTokBuf+ThisTokLen; // Remove an optional ud-suffix. @@ -1208,8 +1224,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } // Is this a Universal Character Name escape? if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { - EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, - hadError, FullSourceLoc(StringToks[i].getLocation(),SM), + EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, + ResultPtr, hadError, + FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth, Diags, Features); continue; } diff --git a/test/CXX/lex/lex.charset/p2-cxx11.cpp b/test/CXX/lex/lex.charset/p2-cxx11.cpp new file mode 100644 index 0000000000..b9192cebe6 --- /dev/null +++ b/test/CXX/lex/lex.charset/p2-cxx11.cpp @@ -0,0 +1,42 @@ +// RUN: %clang_cc1 -verify -std=c++11 %s + +char c00 = '\u0000'; // ok +char c01 = '\u0001'; // ok +char c1f = '\u001f'; // ok +char c20 = '\u0020'; // ' ', ok +char c22 = '\u0022'; // ", ok +char c23 = '\u0023'; // #, ok +char c24 = '\u0024'; // $, ok +char c25 = '\u0025'; // %, ok +char c27 = '\u0027'; // ', ok +char c3f = '\u003f'; // ?, ok +char c40 = '\u0040'; // @, ok +char c41 = '\u0041'; // A, ok +char c5f = '\u005f'; // _, ok +char c60 = '\u0060'; // `, ok +char c7e = '\u007e'; // ~, ok +char c7f = '\u007f'; // ok + +wchar_t w007f = L'\u007f'; +wchar_t w0080 = L'\u0080'; +wchar_t w009f = L'\u009f'; +wchar_t w00a0 = L'\u00a0'; + +wchar_t wd799 = L'\ud799'; +wchar_t wd800 = L'\ud800'; // expected-error {{invalid universal character}} +wchar_t wdfff = L'\udfff'; // expected-error {{invalid universal character}} +wchar_t we000 = L'\ue000'; + +char32_t w10fffe = U'\U0010fffe'; +char32_t w10ffff = U'\U0010ffff'; +char32_t w110000 = U'\U00110000'; // expected-error {{invalid universal character}} + +const char *p1 = "\u0000\u0001\u001f\u0020\u0022\u0023\u0024\u0025\u0027\u003f\u0040\u0041\u005f\u0060\u007e\u007f"; +const wchar_t *p2 = L"\u0000\u0012\u004e\u007f\u0080\u009f\u00a0\ud799\ue000"; +const char *p3 = u8"\u0000\u0012\u004e\u007f\u0080\u009f\u00a0\ud799\ue000"; +const char16_t *p4 = u"\u0000\u0012\u004e\u007f\u0080\u009f\u00a0\ud799\ue000"; +const char32_t *p5 = U"\u0000\u0012\u004e\u007f\u0080\u009f\u00a0\ud799\ue000"; +const wchar_t *p6 = L"foo \U00110000 bar"; // expected-error {{invalid universal character}} +const char *p7 = u8"foo \U0000d800 bar"; // expected-error {{invalid universal character}} +const char16_t *p8 = u"foo \U0000dfff bar"; // expected-error {{invalid universal character}} +const char32_t *p9 = U"foo \U0010ffff bar"; // ok diff --git a/test/CXX/lex/lex.charset/p2-cxx98.cpp b/test/CXX/lex/lex.charset/p2-cxx98.cpp new file mode 100644 index 0000000000..a5b7ab6488 --- /dev/null +++ b/test/CXX/lex/lex.charset/p2-cxx98.cpp @@ -0,0 +1,55 @@ +// RUN: %clang_cc1 -verify -std=c++98 %s + +char c00 = '\u0000'; // expected-error {{universal character name refers to a control character}} +char c01 = '\u0001'; // expected-error {{universal character name refers to a control character}} +char c1f = '\u001f'; // expected-error {{universal character name refers to a control character}} +char c20 = '\u0020'; // ' ', expected-error {{character ' ' cannot be specified by a universal character name}} +char c22 = '\u0022'; // ", expected-error {{character '"' cannot be specified by a universal character name}} +char c23 = '\u0023'; // #, expected-error {{character '#' cannot be specified by a universal character name}} +char c24 = '\u0024'; // $, ok +char c25 = '\u0025'; // %, expected-error {{character '%' cannot be specified by a universal character name}} +char c27 = '\u0027'; // ', expected-error {{character ''' cannot be specified by a universal character name}} +char c3f = '\u003f'; // ?, expected-error {{character '?' cannot be specified by a universal character name}} +char c40 = '\u0040'; // @, ok +char c41 = '\u0041'; // A, expected-error {{character 'A' cannot be specified by a universal character name}} +char c5f = '\u005f'; // _, expected-error {{character '_' cannot be specified by a universal character name}} +char c60 = '\u0060'; // `, ok +char c7e = '\u007e'; // ~, expected-error {{character '~' cannot be specified by a universal character name}} +char c7f = '\u007f'; // expected-error {{universal character name refers to a control character}} + +wchar_t w007f = L'\u007f'; // expected-error {{universal character name refers to a control character}} +wchar_t w0080 = L'\u0080'; // expected-error {{universal character name refers to a control character}} +wchar_t w009f = L'\u009f'; // expected-error {{universal character name refers to a control character}} +wchar_t w00a0 = L'\u00a0'; + +wchar_t wd799 = L'\ud799'; +wchar_t wd800 = L'\ud800'; // expected-error {{invalid universal character}} +wchar_t wdfff = L'\udfff'; // expected-error {{invalid universal character}} +wchar_t we000 = L'\ue000'; + +const char *s00 = "\u0000"; // expected-error {{universal character name refers to a control character}} +const char *s01 = "\u0001"; // expected-error {{universal character name refers to a control character}} +const char *s1f = "\u001f"; // expected-error {{universal character name refers to a control character}} +const char *s20 = "\u0020"; // ' ', expected-error {{character ' ' cannot be specified by a universal character name}} +const char *s22 = "\u0022"; // ", expected-error {{character '"' cannot be specified by a universal character name}} +const char *s23 = "\u0023"; // #, expected-error {{character '#' cannot be specified by a universal character name}} +const char *s24 = "\u0024"; // $, ok +const char *s25 = "\u0025"; // %, expected-error {{character '%' cannot be specified by a universal character name}} +const char *s27 = "\u0027"; // ', expected-error {{character ''' cannot be specified by a universal character name}} +const char *s3f = "\u003f"; // ?, expected-error {{character '?' cannot be specified by a universal character name}} +const char *s40 = "\u0040"; // @, ok +const char *s41 = "\u0041"; // A, expected-error {{character 'A' cannot be specified by a universal character name}} +const char *s5f = "\u005f"; // _, expected-error {{character '_' cannot be specified by a universal character name}} +const char *s60 = "\u0060"; // `, ok +const char *s7e = "\u007e"; // ~, expected-error {{character '~' cannot be specified by a universal character name}} +const char *s7f = "\u007f"; // expected-error {{universal character name refers to a control character}} + +const wchar_t *ws007f = L"\u007f"; // expected-error {{universal character name refers to a control character}} +const wchar_t *ws0080 = L"\u0080"; // expected-error {{universal character name refers to a control character}} +const wchar_t *ws009f = L"\u009f"; // expected-error {{universal character name refers to a control character}} +const wchar_t *ws00a0 = L"\u00a0"; + +const wchar_t *wsd799 = L"\ud799"; +const wchar_t *wsd800 = L"\ud800"; // expected-error {{invalid universal character}} +const wchar_t *wsdfff = L"\udfff"; // expected-error {{invalid universal character}} +const wchar_t *wse000 = L"\ue000"; diff --git a/test/Sema/ucn-cstring.c b/test/Sema/ucn-cstring.c index ac1d37f186..5d3e85dae7 100644 --- a/test/Sema/ucn-cstring.c +++ b/test/Sema/ucn-cstring.c @@ -11,7 +11,6 @@ int main(void) { printf("%s\n", "\U"); // expected-error{{\u used with no following hex digits}} printf("%s\n", "\U00"); // expected-error{{incomplete universal character name}} printf("%s\n", "\U0001"); // expected-error{{incomplete universal character name}} - printf("%s\n", "\u0001"); // expected-error{{invalid universal character}} + printf("%s\n", "\u0001"); // expected-error{{universal character name refers to a control character}} return 0; } - diff --git a/test/SemaCXX/cxx98-compat.cpp b/test/SemaCXX/cxx98-compat.cpp index e9ba0dffc3..903932c189 100644 --- a/test/SemaCXX/cxx98-compat.cpp +++ b/test/SemaCXX/cxx98-compat.cpp @@ -281,3 +281,10 @@ namespace UnevaluatedMemberAccess { int k = sizeof(S::n); // expected-warning {{use of non-static data member 'n' in an unevaluated context is incompatible with C++98}} const std::type_info &ti = typeid(S::n); // expected-warning {{use of non-static data member 'n' in an unevaluated context is incompatible with C++98}} } + +namespace LiteralUCNs { + char c1 = '\u001e'; // expected-warning {{universal character name referring to a control character is incompatible with C++98}} + wchar_t c2 = L'\u0041'; // expected-warning {{specifying character 'A' with a universal character name is incompatible with C++98}} + const char *s1 = "foo\u0031"; // expected-warning {{specifying character '1' with a universal character name is incompatible with C++98}} + const wchar_t *s2 = L"bar\u0085"; // expected-warning {{universal character name referring to a control character is incompatible with C++98}} +} diff --git a/www/cxx_status.html b/www/cxx_status.html index fad14f1c0d..9cc82bad1a 100644 --- a/www/cxx_status.html +++ b/www/cxx_status.html @@ -206,9 +206,9 @@ with clang; other versions have not been tested.</p> <td class="full" align="center">Clang 3.0</td> </tr> <tr> - <td>Universal character name literals</td> + <td>Universal character names in literals</td> <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2170.html">N2170</a></td> - <td class="none" align="center">No</td> + <td class="svn" align="center">SVN</td> </tr> <tr> <td>User-defined literals</td> -- 2.40.0