Add support for UCNs for character literals

author Nico Weber <nicolasweber@gmx.de>

Sat, 9 Oct 2010 00:27:47 +0000 (00:27 +0000)

committer Nico Weber <nicolasweber@gmx.de>

Sat, 9 Oct 2010 00:27:47 +0000 (00:27 +0000)
author Nico Weber <nicolasweber@gmx.de>
Sat, 9 Oct 2010 00:27:47 +0000 (00:27 +0000)
committer Nico Weber <nicolasweber@gmx.de>
Sat, 9 Oct 2010 00:27:47 +0000 (00:27 +0000)
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp

index 9b7c46f091166046399f8f689c9255019fae1c8e..3b739b3d0b58f4a17905098f1891aaa2bd90882e 100644 (file)
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -164,13 +164,10 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  }
  
  /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
-/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
-/// When we decide to implement UCN's for character constants and identifiers,
-/// we will likely rework our support for UCN's.
-static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
-                             char *&ResultBuf, bool &HadError,
+/// return the UTF32.
+static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+                             uint32_t &UcnVal, unsigned short &UcnLen,
                               SourceLocation Loc, Preprocessor &PP,
-                             bool wide,
                               bool Complain) {
    if (!PP.getLangOptions().CPlusPlus && !PP.getLangOptions().C99)
      PP.Diag(Loc, diag::warn_ucn_not_valid_in_c89);
@@ -184,27 +181,22 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
    if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
      if (Complain)
        PP.Diag(Loc, diag::err_ucn_escape_no_digits);
-    HadError = 1;
-    return;
+    return false;
    }
-  typedef uint32_t UTF32;
-
-  UTF32 UcnVal = 0;
-  unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
+  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
    unsigned short UcnLenSave = UcnLen;
-  for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
      int CharVal = HexDigitValue(ThisTokBuf[0]);
      if (CharVal == -1) break;
      UcnVal <<= 4;
      UcnVal |= CharVal;
    }
    // If we didn't consume the proper number of digits, there is a problem.
-  if (UcnLen) {
+  if (UcnLenSave) {
      if (Complain)
        PP.Diag(PP.AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin),
                diag::err_ucn_escape_incomplete);
-    HadError = 1;
-    return;
+    return false;
    }
    // Check UCN constraints (C99 6.4.3p2).
    if ((UcnVal < 0xa0 &&
@@ -213,13 +205,33 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
        || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
      if (Complain)
        PP.Diag(Loc, diag::err_ucn_escape_invalid);
+    return false;
+  }
+  return true;
+}
+
+/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
+/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
+/// StringLiteralParser. When we decide to implement UCN's for identifiers,
+/// we will likely rework our support for UCN's.
+static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+                             char *&ResultBuf, bool &HadError,
+                             SourceLocation Loc, Preprocessor &PP,
+                             bool wide,
+                             bool Complain) {
+  typedef uint32_t UTF32;
+  UTF32 UcnVal = 0;
+  unsigned short UcnLen = 0;
+  if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd,
+                        UcnVal, UcnLen, Loc, PP, Complain)) {
      HadError = 1;
      return;
    }
+
    if (wide) {
-    (void)UcnLenSave;
-    assert((UcnLenSave == 4 || UcnLenSave == 8) && 
-           "ProcessUCNEscape - only ucn length of 4 or 8 supported");
+    (void)UcnLen;
+    assert((UcnLen== 4 || UcnLen== 8) && 
+           "EncodeUCNEscape - only ucn length of 4 or 8 supported");
  
      if (!PP.getLangOptions().ShortWChar) {
        // Note: our internal rep of wide char tokens is always little-endian.
@@ -702,11 +714,26 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    bool Warned = false;
    while (begin[0] != '\'') {
      uint64_t ResultChar;
+
+      // Is this a Universal Character Name escape?
      if (begin[0] != '\\')     // If this is a normal character, consume it.
        ResultChar = *begin++;
-    else                      // Otherwise, this is an escape character.
-      ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
-                                     /*Complain=*/true);
+    else {                    // Otherwise, this is an escape character.
+      // Check for UCN.
+      if (begin[1] == 'u' || begin[1] == 'U') {
+        uint32_t utf32 = 0;
+        unsigned short UcnLen = 0;
+        if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
+                              Loc, PP, /*Complain=*/true)) {
+          HadError = 1;
+        }
+        ResultChar = utf32;
+      } else {
+        // Otherwise, this is a non-UCN escape character.  Process it.
+        ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
+                                       /*Complain=*/true);
+      }
+    }
  
      // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
      // implementation defined (C99 6.4.4.4p10).
@@ -746,6 +773,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    // Transfer the value from APInt to uint64_t
    Value = LitVal.getZExtValue();
  
+  if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+    PP.Diag(Loc, diag::warn_ucn_escape_too_large);
+
    // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
    // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
    // character constants are not sign extended in the this implementation:
@@ -915,9 +945,9 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
-        ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
-                         hadError, StringToks[i].getLocation(), PP, wide, 
-                         Complain);
+        EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+                        hadError, StringToks[i].getLocation(), PP, wide, 
+                        Complain);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
diff --git a/test/CodeGen/char-literal.c b/test/CodeGen/char-literal.c

new file mode 100644 (file)

index 0000000..aff76d2
--- /dev/null
+++ b/test/CodeGen/char-literal.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -x c++ -triple i386-unknown-unkown -emit-llvm %s -o - | FileCheck %s
+// Runs in c++ mode so that wchar_t is available.
+
+int main() {
+  // CHECK: store i8 97
+  char a = 'a';
+
+  // Should pick second character.
+  // CHECK: store i8 98
+  char b = 'ab';
+
+  // CHECK: store i32 97
+  wchar_t wa = L'a';
+
+  // Should pick second character.
+  // CHECK: store i32 98
+  wchar_t wb = L'ab';
+
+  // Should pick last character and store its lowest byte.
+  // This does not match gcc, which takes the last character, converts it to
+  // utf8, and then picks the second-lowest byte of that (they probably store
+  // the utf8 in uint16_ts internally and take the lower byte of that).
+  // CHECK: store i8 48
+  char c = '\u1120\u0220\U00102030';
+
+  // CHECK: store i32 61451
+  wchar_t wc = L'\uF00B';
+
+  // CHECK: store i32 1110027
+  wchar_t wd = L'\U0010F00B';
+
+  // Should pick second character.
+  // CHECK: store i32 1110027
+  wchar_t we = L'\u1234\U0010F00B';
+}
diff --git a/test/CodeGen/string-literal-short-wstring.c b/test/CodeGen/string-literal-short-wstring.c

index de84953dd3f9f694cfe56dc390781fe6e645c4d1..be1f1dd66c7c34125c02238a9c8c86fbb324b321 100644 (file)
--- a/test/CodeGen/string-literal-short-wstring.c
+++ b/test/CodeGen/string-literal-short-wstring.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm -fshort-wchar %s -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -emit-llvm -fshort-wchar %s -o - | FileCheck %s
+// Runs in c++ mode so that wchar_t is available.
  
  int main() {
    // This should convert to utf8.
@@ -6,9 +7,37 @@ int main() {
    char b[10] = "\u1120\u0220\U00102030";
  
    // CHECK: private constant [6 x i8] c"A\00B\00\00\00"
-  void *foo = L"AB";
+  const wchar_t *foo = L"AB";
  
    // This should convert to utf16.
    // CHECK: private constant [10 x i8] c" \11 \02\C8\DB0\DC\00\00"
-  void *bar = L"\u1120\u0220\U00102030";
+  const wchar_t *bar = L"\u1120\u0220\U00102030";
+
+
+
+  // Should pick second character.
+  // CHECK: store i8 98
+  char c = 'ab';
+
+  // CHECK: store i16 97
+  wchar_t wa = L'a';
+
+  // Should pick second character.
+  // CHECK: store i16 98
+  wchar_t wb = L'ab';
+
+  // -4085 == 0xf00b
+  // CHECK: store i16 -4085
+  wchar_t wc = L'\uF00B';
+
+  // Should take lower word of the 4byte UCN sequence. This does not match
+  // gcc. I don't understand what gcc does (it looks like it converts to utf16,
+  // then takes the second (!) utf16 word, swaps the lower two nibbles, and
+  // stores that?).
+  // CHECK: store i16 -4085
+  wchar_t wd = L'\U0010F00B';  // has utf16 encoding dbc8 dcb0
+
+  // Should pick second character. (gcc: -9205)
+  // CHECK: store i16 -4085
+  wchar_t we = L'\u1234\U0010F00B';
  }
diff --git a/test/Lexer/c90.c b/test/Lexer/c90.c

index f74135542c4fd0c2405774fb4efc35e93924c3e9..d91057257dfbd4d425d39b8b2562e536ea2e4ee2 100644 (file)
--- a/test/Lexer/c90.c
+++ b/test/Lexer/c90.c
@@ -30,4 +30,5 @@ void test2() {
  
  void test3() {
    (void)L"\u1234";  // expected-error {{unicode escape sequences are only valid in C99 or C++}}
+  (void)L'\u1234';  // expected-error {{unicode escape sequences are only valid in C99 or C++}}
  }
diff --git a/test/Lexer/wchar.c b/test/Lexer/wchar.c

index cbc0c455f82f6bd33c4bf73c879c9ca7dc3763b3..ac82c1f73b4d94f194fff7fbc5163e19287b37f9 100644 (file)
--- a/test/Lexer/wchar.c
+++ b/test/Lexer/wchar.c
@@ -2,5 +2,11 @@
  
  void f() {
    (void)L"\U00010000";  // expected-warning {{character unicode escape sequence too long for its type}}
+
+  (void)L'\U00010000';  // expected-warning {{character unicode escape sequence too long for its type}}
+
+  (void)L'ab';  // expected-warning {{extraneous characters in wide character constant ignored}}
+
+  (void)L'a\u1000';  // expected-warning {{extraneous characters in wide character constant ignored}}
  }
author	Nico Weber <nicolasweber@gmx.de>
	Sat, 9 Oct 2010 00:27:47 +0000 (00:27 +0000)
committer	Nico Weber <nicolasweber@gmx.de>
	Sat, 9 Oct 2010 00:27:47 +0000 (00:27 +0000)
lib/Lex/LiteralSupport.cpp		patch \| blob \| history
test/CodeGen/char-literal.c	[new file with mode: 0644]	patch \| blob
test/CodeGen/string-literal-short-wstring.c		patch \| blob \| history
test/Lexer/c90.c		patch \| blob \| history
test/Lexer/wchar.c		patch \| blob \| history