Add support for 4-byte UCNs like \U12345678. Warn about UCNs in C90 mode.

author Nico Weber <nicolasweber@gmx.de>
Wed, 6 Oct 2010 04:57:26 +0000 (04:57 +0000)
committer Nico Weber <nicolasweber@gmx.de>
Wed, 6 Oct 2010 04:57:26 +0000 (04:57 +0000)
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td

index dcb05c8fcd45ff86ad215b30ccb0a397fa1a02f5..8f61c69f9ea282aed5fc382c723aaa29d6c61c3d 100644 (file)
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -98,6 +98,10 @@ def warn_hex_escape_too_large : ExtWarn<"hex escape sequence out of range">;
  def ext_string_too_long : Extension<"string literal of length %0 exceeds "
    "maximum length %1 that %select{C90|ISO C99|C++}2 compilers are required to "
    "support">, InGroup<OverlengthStrings>;
+def warn_ucn_escape_too_large : ExtWarn<
+  "character unicode escape sequence too long for its type">;
+def warn_ucn_not_valid_in_c89 : ExtWarn<
+  "unicode escape sequences are only valid in C99 or C++">;
    
  //===----------------------------------------------------------------------===//
  // PTH Diagnostics
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp

index fb543d0f03b3d24626886b0a0113698ea8d53879..9b7c46f091166046399f8f689c9255019fae1c8e 100644 (file)
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -172,8 +172,8 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                               SourceLocation Loc, Preprocessor &PP,
                               bool wide,
                               bool Complain) {
-  // FIXME: Add a warning - UCN's are only valid in C++ & C99.
-  // FIXME: Handle wide strings.
+  if (!PP.getLangOptions().CPlusPlus && !PP.getLangOptions().C99)
+    PP.Diag(Loc, diag::warn_ucn_not_valid_in_c89);
  
    // Save the beginning of the string (for error diagnostics).
    const char *ThisTokBegin = ThisTokBuf;
@@ -218,13 +218,34 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
    }
    if (wide) {
      (void)UcnLenSave;
-    assert(UcnLenSave == 4 && 
-           "ProcessUCNEscape - only ucn length of 4 supported");
-    // little endian assumed.
-    *ResultBuf++ = (UcnVal & 0x000000FF);
-    *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
-    *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
-    *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+    assert((UcnLenSave == 4 || UcnLenSave == 8) && 
+           "ProcessUCNEscape - only ucn length of 4 or 8 supported");
+
+    if (!PP.getLangOptions().ShortWChar) {
+      // Note: our internal rep of wide char tokens is always little-endian.
+      *ResultBuf++ = (UcnVal & 0x000000FF);
+      *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+      *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
+      *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+      return;
+    }
+
+    // Convert to UTF16.
+    if (UcnVal < (UTF32)0xFFFF) {
+      *ResultBuf++ = (UcnVal & 0x000000FF);
+      *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+      return;
+    }
+    PP.Diag(Loc, diag::warn_ucn_escape_too_large);
+
+    typedef uint16_t UTF16;
+    UcnVal -= 0x10000;
+    UTF16 surrogate1 = 0xD800 + (UcnVal >> 10);
+    UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF);
+    *ResultBuf++ = (surrogate1 & 0x000000FF);
+    *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8;
+    *ResultBuf++ = (surrogate2 & 0x000000FF);
+    *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
      return;
    }
    // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
diff --git a/test/CodeGen/string-literal-short-wstring.c b/test/CodeGen/string-literal-short-wstring.c

new file mode 100644 (file)

index 0000000..de84953
--- /dev/null
+++ b/test/CodeGen/string-literal-short-wstring.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -emit-llvm -fshort-wchar %s -o - | FileCheck %s
+
+int main() {
+  // This should convert to utf8.
+  // CHECK: internal constant [10 x i8] c"\E1\84\A0\C8\A0\F4\82\80\B0\00", align 1
+  char b[10] = "\u1120\u0220\U00102030";
+
+  // CHECK: private constant [6 x i8] c"A\00B\00\00\00"
+  void *foo = L"AB";
+
+  // This should convert to utf16.
+  // CHECK: private constant [10 x i8] c" \11 \02\C8\DB0\DC\00\00"
+  void *bar = L"\u1120\u0220\U00102030";
+}
diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c

index 22a81e71855188a5d93c6a9989b24ad914919f32..457ff6ca7aeee9d244526c97a61c30e6341c210b 100644 (file)
--- a/test/CodeGen/string-literal.c
+++ b/test/CodeGen/string-literal.c
@@ -1,7 +1,16 @@
-// RUN: %clang_cc1 -emit-llvm %s -o -
+// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
  
  int main() {
+  // CHECK: internal constant [10 x i8] c"abc\00\00\00\00\00\00\00", align 1
    char a[10] = "abc";
  
+  // This should convert to utf8.
+  // CHECK: internal constant [10 x i8] c"\E1\84\A0\C8\A0\F4\82\80\B0\00", align 1
+  char b[10] = "\u1120\u0220\U00102030";
+
+  // CHECK: private constant [12 x i8] c"A\00\00\00B\00\00\00\00\00\00\00"
    void *foo = L"AB";
+
+  // CHECK: private constant [12 x i8] c"4\12\00\00\0B\F0\10\00\00\00\00\00"
+  void *bar = L"\u1234\U0010F00B";
  }
diff --git a/test/Lexer/c90.c b/test/Lexer/c90.c

index f19139710280d3ffecfb04712bfaed42d11c3d96..f74135542c4fd0c2405774fb4efc35e93924c3e9 100644 (file)
--- a/test/Lexer/c90.c
+++ b/test/Lexer/c90.c
@@ -27,3 +27,7 @@ void test2() {
      "sdjflksdjf lksdjf skldfjsdkljflksdjf kldsjflkdsj fldks jflsdkjfds"
      "sdjflksdjf lksdjf skldfjsdkljflksdjf kldsjflkdsj fldks jflsdkjfds";
  }
+
+void test3() {
+  (void)L"\u1234";  // expected-error {{unicode escape sequences are only valid in C99 or C++}}
+}
diff --git a/test/Lexer/wchar.c b/test/Lexer/wchar.c

new file mode 100644 (file)

index 0000000..cbc0c45
--- /dev/null
+++ b/test/Lexer/wchar.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -fsyntax-only -fshort-wchar -verify %s
+
+void f() {
+  (void)L"\U00010000";  // expected-warning {{character unicode escape sequence too long for its type}}
+}
+
author	Nico Weber <nicolasweber@gmx.de>
	Wed, 6 Oct 2010 04:57:26 +0000 (04:57 +0000)
committer	Nico Weber <nicolasweber@gmx.de>
	Wed, 6 Oct 2010 04:57:26 +0000 (04:57 +0000)
include/clang/Basic/DiagnosticLexKinds.td		patch | blob | history
lib/Lex/LiteralSupport.cpp		patch | blob | history
test/CodeGen/string-literal-short-wstring.c	[new file with mode: 0644]	patch | blob
test/CodeGen/string-literal.c		patch | blob | history
test/Lexer/c90.c		patch | blob | history
test/Lexer/wchar.c	[new file with mode: 0644]	patch | blob