From 40de650390d05bea9a0422cf220dde66a0da048a Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Mon, 17 Feb 2014 21:52:30 +0000 Subject: [PATCH] PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@201532 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Lex/Lexer.h | 22 ++- include/clang/Lex/LiteralSupport.h | 7 +- lib/Lex/Lexer.cpp | 150 ++++++++++++-------- lib/Lex/LiteralSupport.cpp | 79 +++++++++-- lib/Lex/Preprocessor.cpp | 42 ------ test/Parser/cxx11-user-defined-literals.cpp | 32 +++++ 6 files changed, 216 insertions(+), 116 deletions(-) diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index f456fa9cd5..af6d8cf750 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -614,8 +614,28 @@ private: /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is /// invalid. uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok); -}; + /// \brief Try to consume a UCN as part of an identifier at the current + /// location. + /// \param CurPtr Initially points to the range of characters in the source + /// buffer containing the '\'. Updated to point past the end of + /// the UCN on success. + /// \param Size The number of characters occupied by the '\' (including + /// trigraphs and escaped newlines). + /// \param Result The token being produced. Marked as containing a UCN on + /// success. + /// \return \c true if a UCN was lexed and it produced an acceptable + /// identifier character, \c false otherwise. + bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, + Token &Result); + + /// \brief Try to consume an identifier character encoded in UTF-8. + /// \param CurPtr Points to the start of the (potential) UTF-8 code unit + /// sequence. On success, updated to point past the end of it. + /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier + /// character was lexed, \c false otherwise. + bool tryConsumeIdentifierUTF8Char(const char *&CurPtr); +}; } // end namespace clang diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h index 64d5aa2d59..3e52418c0e 100644 --- a/include/clang/Lex/LiteralSupport.h +++ b/include/clang/Lex/LiteralSupport.h @@ -33,6 +33,9 @@ class TargetInfo; class SourceManager; class LangOptions; +/// Copy characters from Input to Buf, expanding any UCNs. +void expandUCNs(SmallVectorImpl &Buf, StringRef Input); + /// NumericLiteralParser - This performs strict semantic analysis of the content /// of a ppnumber, classifying it as either integer, floating, or erroneous, /// determines the radix of the value and can convert it to a useful value. @@ -48,6 +51,8 @@ class NumericLiteralParser { bool saw_exponent, saw_period, saw_ud_suffix; + SmallString<32> UDSuffixBuf; + public: NumericLiteralParser(StringRef TokSpelling, SourceLocation TokLoc, @@ -72,7 +77,7 @@ public: } StringRef getUDSuffix() const { assert(saw_ud_suffix); - return StringRef(SuffixBegin, ThisTokEnd - SuffixBegin); + return UDSuffixBuf; } unsigned getUDSuffixOffset() const { assert(saw_ud_suffix); diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index c7eab490ad..cfa835d173 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -1445,7 +1445,50 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, << Range; } } - } +} + +bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, + Token &Result) { + const char *UCNPtr = CurPtr + Size; + uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0); + if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) + return false; + + if (!isLexingRawMode()) + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UCNPtr), + /*IsFirst=*/false); + + Result.setFlag(Token::HasUCN); + if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || + (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) + CurPtr = UCNPtr; + else + while (CurPtr != UCNPtr) + (void)getAndAdvanceChar(CurPtr, Result); + return true; +} + +bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { + const char *UnicodePtr = CurPtr; + UTF32 CodePoint; + ConversionResult Result = + llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr, + (const UTF8 *)BufferEnd, + &CodePoint, + strictConversion); + if (Result != conversionOK || + !isAllowedIDChar(static_cast(CodePoint), LangOpts)) + return false; + + if (!isLexingRawMode()) + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr), + /*IsFirst=*/false); + + CurPtr = UnicodePtr; + return true; +} bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] @@ -1500,47 +1543,10 @@ FinishIdentifier: C = getCharAndSize(CurPtr, Size); continue; - } else if (C == '\\') { - const char *UCNPtr = CurPtr + Size; - uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0); - if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) - goto FinishIdentifier; - - if (!isLexingRawMode()) { - maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, - makeCharRange(*this, CurPtr, UCNPtr), - /*IsFirst=*/false); - } - - Result.setFlag(Token::HasUCN); - if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || - (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) - CurPtr = UCNPtr; - else - while (CurPtr != UCNPtr) - (void)getAndAdvanceChar(CurPtr, Result); - + } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { C = getCharAndSize(CurPtr, Size); continue; - } else if (!isASCII(C)) { - const char *UnicodePtr = CurPtr; - UTF32 CodePoint; - ConversionResult Result = - llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr, - (const UTF8 *)BufferEnd, - &CodePoint, - strictConversion); - if (Result != conversionOK || - !isAllowedIDChar(static_cast(CodePoint), LangOpts)) - goto FinishIdentifier; - - if (!isLexingRawMode()) { - maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, - makeCharRange(*this, CurPtr, UnicodePtr), - /*IsFirst=*/false); - } - - CurPtr = UnicodePtr; + } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { C = getCharAndSize(CurPtr, Size); continue; } else if (!isIdentifierBody(C)) { @@ -1576,7 +1582,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { unsigned Size; char C = getCharAndSize(CurPtr, Size); char PrevCh = 0; - while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix. + while (isPreprocessingNumberBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); PrevCh = C; C = getCharAndSize(CurPtr, Size); @@ -1618,6 +1624,12 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { } } + // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. + if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) + return LexNumericConstant(Result, CurPtr); + if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) + return LexNumericConstant(Result, CurPtr); + // Update the location of token as well as BufferPtr. const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::numeric_constant); @@ -1631,23 +1643,35 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, bool IsStringLiteral) { assert(getLangOpts().CPlusPlus); - // Maximally munch an identifier. FIXME: UCNs. + // Maximally munch an identifier. unsigned Size; char C = getCharAndSize(CurPtr, Size); - if (isIdentifierHead(C)) { - if (!getLangOpts().CPlusPlus11) { - if (!isLexingRawMode()) - Diag(CurPtr, - C == '_' ? diag::warn_cxx11_compat_user_defined_literal - : diag::warn_cxx11_compat_reserved_user_defined_literal) - << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); + bool Consumed = false; + + if (!isIdentifierHead(C)) { + if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) + Consumed = true; + else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) + Consumed = true; + else return CurPtr; - } + } - // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix - // that does not start with an underscore is ill-formed. As a conforming - // extension, we treat all such suffixes as if they had whitespace before - // them. + if (!getLangOpts().CPlusPlus11) { + if (!isLexingRawMode()) + Diag(CurPtr, + C == '_' ? diag::warn_cxx11_compat_user_defined_literal + : diag::warn_cxx11_compat_reserved_user_defined_literal) + << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); + return CurPtr; + } + + // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix + // that does not start with an underscore is ill-formed. As a conforming + // extension, we treat all such suffixes as if they had whitespace before + // them. We assume a suffix beginning with a UCN or UTF-8 character is more + // likely to be a ud-suffix than a macro, however, and accept that. + if (!Consumed) { bool IsUDSuffix = false; if (C == '_') IsUDSuffix = true; @@ -1685,16 +1709,22 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, Diag(CurPtr, getLangOpts().MSVCCompat ? diag::ext_ms_reserved_user_defined_literal : diag::ext_reserved_user_defined_literal) - << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); + << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); return CurPtr; } - Result.setFlag(Token::HasUDSuffix); - do { - CurPtr = ConsumeChar(CurPtr, Size, Result); - C = getCharAndSize(CurPtr, Size); - } while (isIdentifierBody(C)); + CurPtr = ConsumeChar(CurPtr, Size, Result); } + + Result.setFlag(Token::HasUDSuffix); + while (true) { + C = getCharAndSize(CurPtr, Size); + if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } + else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} + else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} + else break; + } + return CurPtr; } diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index 17c6bb3049..a71518184c 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -212,6 +212,48 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, return ResultChar; } +static void appendCodePoint(unsigned Codepoint, + llvm::SmallVectorImpl &Str) { + char ResultBuf[4]; + char *ResultPtr = ResultBuf; + bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); + (void)Res; + assert(Res && "Unexpected conversion failure"); + Str.append(ResultBuf, ResultPtr); +} + +void clang::expandUCNs(SmallVectorImpl &Buf, StringRef Input) { + for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { + if (*I != '\\') { + Buf.push_back(*I); + continue; + } + + ++I; + assert(*I == 'u' || *I == 'U'); + + unsigned NumHexDigits; + if (*I == 'u') + NumHexDigits = 4; + else + NumHexDigits = 8; + + assert(I + NumHexDigits <= E); + + uint32_t CodePoint = 0; + for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { + unsigned Value = llvm::hexDigitValue(*I); + assert(Value != -1U); + + CodePoint <<= 4; + CodePoint += Value; + } + + appendCodePoint(CodePoint, Buf); + --I; + } +} + /// ProcessUCNEscape - Read the Universal Character Name, check constraints and /// return the UTF32. static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, @@ -625,8 +667,9 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, } if (s != ThisTokEnd) { - if (isValidUDSuffix(PP.getLangOpts(), - StringRef(SuffixBegin, ThisTokEnd - SuffixBegin))) { + // FIXME: Don't bother expanding UCNs if !tok.hasUCN(). + expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)); + if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) { // Any suffix pieces we might have parsed are actually part of the // ud-suffix. isLong = false; @@ -992,7 +1035,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, do { --end; } while (end[-1] != '\''); - UDSuffixBuf.assign(end, UDSuffixEnd); + // FIXME: Don't bother with this if !tok.hasUCN(). + expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end)); UDSuffixOffset = end - TokBegin; } @@ -1311,23 +1355,34 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd); if (UDSuffixBuf.empty()) { - UDSuffixBuf.assign(UDSuffix); + if (StringToks[i].hasUCN()) + expandUCNs(UDSuffixBuf, UDSuffix); + else + UDSuffixBuf.assign(UDSuffix); UDSuffixToken = i; UDSuffixOffset = ThisTokEnd - ThisTokBuf; UDSuffixTokLoc = StringToks[i].getLocation(); - } else if (!UDSuffixBuf.equals(UDSuffix)) { + } else { + SmallString<32> ExpandedUDSuffix; + if (StringToks[i].hasUCN()) { + expandUCNs(ExpandedUDSuffix, UDSuffix); + UDSuffix = ExpandedUDSuffix; + } + // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the // result of a concatenation involving at least one user-defined-string- // literal, all the participating user-defined-string-literals shall // have the same ud-suffix. - if (Diags) { - SourceLocation TokLoc = StringToks[i].getLocation(); - Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix) - << UDSuffixBuf << UDSuffix - << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc) - << SourceRange(TokLoc, TokLoc); + if (!UDSuffixBuf.equals(UDSuffix)) { + if (Diags) { + SourceLocation TokLoc = StringToks[i].getLocation(); + Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix) + << UDSuffixBuf << UDSuffix + << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc) + << SourceRange(TokLoc, TokLoc); + } + hadError = true; } - hadError = true; } } diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp index 9ffc83ceff..1e54ab3725 100644 --- a/lib/Lex/Preprocessor.cpp +++ b/lib/Lex/Preprocessor.cpp @@ -503,48 +503,6 @@ void Preprocessor::EndSourceFile() { // Lexer Event Handling. //===----------------------------------------------------------------------===// -static void appendCodePoint(unsigned Codepoint, - llvm::SmallVectorImpl &Str) { - char ResultBuf[4]; - char *ResultPtr = ResultBuf; - bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); - (void)Res; - assert(Res && "Unexpected conversion failure"); - Str.append(ResultBuf, ResultPtr); -} - -static void expandUCNs(SmallVectorImpl &Buf, StringRef Input) { - for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { - if (*I != '\\') { - Buf.push_back(*I); - continue; - } - - ++I; - assert(*I == 'u' || *I == 'U'); - - unsigned NumHexDigits; - if (*I == 'u') - NumHexDigits = 4; - else - NumHexDigits = 8; - - assert(I + NumHexDigits <= E); - - uint32_t CodePoint = 0; - for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { - unsigned Value = llvm::hexDigitValue(*I); - assert(Value != -1U); - - CodePoint <<= 4; - CodePoint += Value; - } - - appendCodePoint(CodePoint, Buf); - --I; - } -} - /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the /// identifier information for the token and install it into the token, /// updating the token kind accordingly. diff --git a/test/Parser/cxx11-user-defined-literals.cpp b/test/Parser/cxx11-user-defined-literals.cpp index 613c0b9ec6..31032e7f7f 100644 --- a/test/Parser/cxx11-user-defined-literals.cpp +++ b/test/Parser/cxx11-user-defined-literals.cpp @@ -111,3 +111,35 @@ void operator "" "" U"" // expected-error {{cannot have an encoding prefix}} "" _also_not_char(const char *); void operator "" u8"" "\u0123" "hello"_all_of_the_things ""(const char*); // expected-error {{must be '""'}} + +// Make sure we treat UCNs and UTF-8 as equivalent. +int operator""_µs(unsigned long long) {} // expected-note {{previous}} +int hundred_µs = 50_µs + 50_\u00b5s; +int operator""_\u00b5s(unsigned long long) {} // expected-error {{redefinition of 'operator "" _µs'}} + +int operator""_\U0000212B(long double) {} // expected-note {{previous}} +int hundred_Å = 50.0_Å + 50._\U0000212B; +int operator""_Å(long double) {} // expected-error {{redefinition of 'operator "" _Å'}} + +int operator""_𐀀(char) {} // expected-note {{previous}} +int 𐀀 = '4'_𐀀 + '2'_\U00010000; +int operator""_\U00010000(char) {} // expected-error {{redefinition of 'operator "" _𐀀'}} + +// These all declare the same function. +int operator""_℮""_\u212e""_\U0000212e""(const char*, size_t); +int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t); +int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t); +int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e""; + +void operator""_℮""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}} +void operator""_℮""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}} +void operator""_\u212e""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}} +void operator""_\u212e""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}} + +void operator""_℮""_℮(unsigned long long) {} // expected-note {{previous}} +void operator""_\u212e""_\u212e(unsigned long long) {} // expected-error {{redefinition}} + +#define ¢ *0.01 // expected-error {{macro names must be identifiers}} +constexpr int operator""_¢(long double d) { return d * 100; } // expected-error {{non-ASCII}} +constexpr int operator""_¢(unsigned long long n) { return n; } // expected-error {{non-ASCII}} +static_assert(0.02_¢ == 2_¢, ""); // expected-error 2{{non-ASCII}} -- 2.40.0