From: Nico Weber Date: Sat, 9 Oct 2010 00:27:47 +0000 (+0000) Subject: Add support for UCNs for character literals X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=59705aee3fe01aa6fb6962dd11350161b47983d9;p=clang Add support for UCNs for character literals git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@116129 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index 9b7c46f091..3b739b3d0b 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -164,13 +164,10 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, } /// ProcessUCNEscape - Read the Universal Character Name, check constraints and -/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser. -/// When we decide to implement UCN's for character constants and identifiers, -/// we will likely rework our support for UCN's. -static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, - char *&ResultBuf, bool &HadError, +/// return the UTF32. +static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, + uint32_t &UcnVal, unsigned short &UcnLen, SourceLocation Loc, Preprocessor &PP, - bool wide, bool Complain) { if (!PP.getLangOptions().CPlusPlus && !PP.getLangOptions().C99) PP.Diag(Loc, diag::warn_ucn_not_valid_in_c89); @@ -184,27 +181,22 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { if (Complain) PP.Diag(Loc, diag::err_ucn_escape_no_digits); - HadError = 1; - return; + return false; } - typedef uint32_t UTF32; - - UTF32 UcnVal = 0; - unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); + UcnLen = (ThisTokBuf[-1] == 'u' ? 
4 : 8); unsigned short UcnLenSave = UcnLen; - for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) { + for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { int CharVal = HexDigitValue(ThisTokBuf[0]); if (CharVal == -1) break; UcnVal <<= 4; UcnVal |= CharVal; } // If we didn't consume the proper number of digits, there is a problem. - if (UcnLen) { + if (UcnLenSave) { if (Complain) PP.Diag(PP.AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin), diag::err_ucn_escape_incomplete); - HadError = 1; - return; + return false; } // Check UCN constraints (C99 6.4.3p2). if ((UcnVal < 0xa0 && @@ -213,13 +205,33 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ { if (Complain) PP.Diag(Loc, diag::err_ucn_escape_invalid); + return false; + } + return true; +} + +/// EncodeUCNEscape - Read the Universal Character Name, check constraints and +/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of +/// StringLiteralParser. When we decide to implement UCN's for identifiers, +/// we will likely rework our support for UCN's. +static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, + char *&ResultBuf, bool &HadError, + SourceLocation Loc, Preprocessor &PP, + bool wide, + bool Complain) { + typedef uint32_t UTF32; + UTF32 UcnVal = 0; + unsigned short UcnLen = 0; + if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, + UcnVal, UcnLen, Loc, PP, Complain)) { HadError = 1; return; } + if (wide) { - (void)UcnLenSave; - assert((UcnLenSave == 4 || UcnLenSave == 8) && - "ProcessUCNEscape - only ucn length of 4 or 8 supported"); + (void)UcnLen; + assert((UcnLen== 4 || UcnLen== 8) && + "EncodeUCNEscape - only ucn length of 4 or 8 supported"); if (!PP.getLangOptions().ShortWChar) { // Note: our internal rep of wide char tokens is always little-endian. 
@@ -702,11 +714,26 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, bool Warned = false; while (begin[0] != '\'') { uint64_t ResultChar; + + // Is this a Universal Character Name escape? if (begin[0] != '\\') // If this is a normal character, consume it. ResultChar = *begin++; - else // Otherwise, this is an escape character. - ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP, - /*Complain=*/true); + else { // Otherwise, this is an escape character. + // Check for UCN. + if (begin[1] == 'u' || begin[1] == 'U') { + uint32_t utf32 = 0; + unsigned short UcnLen = 0; + if (!ProcessUCNEscape(begin, end, utf32, UcnLen, + Loc, PP, /*Complain=*/true)) { + HadError = 1; + } + ResultChar = utf32; + } else { + // Otherwise, this is a non-UCN escape character. Process it. + ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP, + /*Complain=*/true); + } + } // If this is a multi-character constant (e.g. 'abc'), handle it. These are // implementation defined (C99 6.4.4.4p10). @@ -746,6 +773,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // Transfer the value from APInt to uint64_t Value = LitVal.getZExtValue(); + if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF) + PP.Diag(Loc, diag::warn_ucn_escape_too_large); + // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple // character constants are not sign extended in the this implementation: @@ -915,9 +945,9 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks, } // Is this a Universal Character Name escape? 
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { - ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, - hadError, StringToks[i].getLocation(), PP, wide, - Complain); + EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, + hadError, StringToks[i].getLocation(), PP, wide, + Complain); continue; } // Otherwise, this is a non-UCN escape character. Process it. diff --git a/test/CodeGen/char-literal.c b/test/CodeGen/char-literal.c new file mode 100644 index 0000000000..aff76d280d --- /dev/null +++ b/test/CodeGen/char-literal.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -x c++ -triple i386-unknown-unkown -emit-llvm %s -o - | FileCheck %s +// Runs in c++ mode so that wchar_t is available. + +int main() { + // CHECK: store i8 97 + char a = 'a'; + + // Should pick second character. + // CHECK: store i8 98 + char b = 'ab'; + + // CHECK: store i32 97 + wchar_t wa = L'a'; + + // Should pick second character. + // CHECK: store i32 98 + wchar_t wb = L'ab'; + + // Should pick last character and store its lowest byte. + // This does not match gcc, which takes the last character, converts it to + // utf8, and then picks the second-lowest byte of that (they probably store + // the utf8 in uint16_ts internally and take the lower byte of that). + // CHECK: store i8 48 + char c = '\u1120\u0220\U00102030'; + + // CHECK: store i32 61451 + wchar_t wc = L'\uF00B'; + + // CHECK: store i32 1110027 + wchar_t wd = L'\U0010F00B'; + + // Should pick second character. + // CHECK: store i32 1110027 + wchar_t we = L'\u1234\U0010F00B'; +} diff --git a/test/CodeGen/string-literal-short-wstring.c b/test/CodeGen/string-literal-short-wstring.c index de84953dd3..be1f1dd66c 100644 --- a/test/CodeGen/string-literal-short-wstring.c +++ b/test/CodeGen/string-literal-short-wstring.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -emit-llvm -fshort-wchar %s -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -emit-llvm -fshort-wchar %s -o - | FileCheck %s +// Runs in c++ mode so that wchar_t is available. 
int main() { // This should convert to utf8. @@ -6,9 +7,37 @@ int main() { char b[10] = "\u1120\u0220\U00102030"; // CHECK: private constant [6 x i8] c"A\00B\00\00\00" - void *foo = L"AB"; + const wchar_t *foo = L"AB"; // This should convert to utf16. // CHECK: private constant [10 x i8] c" \11 \02\C8\DB0\DC\00\00" - void *bar = L"\u1120\u0220\U00102030"; + const wchar_t *bar = L"\u1120\u0220\U00102030"; + + + + // Should pick second character. + // CHECK: store i8 98 + char c = 'ab'; + + // CHECK: store i16 97 + wchar_t wa = L'a'; + + // Should pick second character. + // CHECK: store i16 98 + wchar_t wb = L'ab'; + + // -4085 == 0xf00b + // CHECK: store i16 -4085 + wchar_t wc = L'\uF00B'; + + // Should take lower word of the 4byte UCN sequence. This does not match + // gcc. I don't understand what gcc does (it looks like it converts to utf16, + // then takes the second (!) utf16 word, swaps the lower two nibbles, and + // stores that?). + // CHECK: store i16 -4085 + wchar_t wd = L'\U0010F00B'; // has utf16 encoding dbc8 dcb0 + + // Should pick second character. 
(gcc: -9205) + // CHECK: store i16 -4085 + wchar_t we = L'\u1234\U0010F00B'; } diff --git a/test/Lexer/c90.c b/test/Lexer/c90.c index f74135542c..d91057257d 100644 --- a/test/Lexer/c90.c +++ b/test/Lexer/c90.c @@ -30,4 +30,5 @@ void test2() { void test3() { (void)L"\u1234"; // expected-error {{unicode escape sequences are only valid in C99 or C++}} + (void)L'\u1234'; // expected-error {{unicode escape sequences are only valid in C99 or C++}} } diff --git a/test/Lexer/wchar.c b/test/Lexer/wchar.c index cbc0c455f8..ac82c1f73b 100644 --- a/test/Lexer/wchar.c +++ b/test/Lexer/wchar.c @@ -2,5 +2,11 @@ void f() { (void)L"\U00010000"; // expected-warning {{character unicode escape sequence too long for its type}} + + (void)L'\U00010000'; // expected-warning {{character unicode escape sequence too long for its type}} + + (void)L'ab'; // expected-warning {{extraneous characters in wide character constant ignored}} + + (void)L'a\u1000'; // expected-warning {{extraneous characters in wide character constant ignored}} }