Implement universal character name (UCN) support for C string literals (C99 6.4.3) and add some very basic tests.

author Steve Naroff <snaroff@apple.com>

Mon, 30 Mar 2009 23:46:03 +0000 (23:46 +0000)

committer Steve Naroff <snaroff@apple.com>

Mon, 30 Mar 2009 23:46:03 +0000 (23:46 +0000)
author Steve Naroff <snaroff@apple.com>
Mon, 30 Mar 2009 23:46:03 +0000 (23:46 +0000)
committer Steve Naroff <snaroff@apple.com>
Mon, 30 Mar 2009 23:46:03 +0000 (23:46 +0000)
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td

index 98f1be2fe419440d771cd8c8de3762eed58678fa..82ebdaddc1727c25c9477c53bc81e8280198d9c3 100644 (file)
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -52,6 +52,10 @@ def ext_nonstandard_escape : Extension<
    "use of non-standard escape character '\\%0'">;
  def ext_unknown_escape : Extension<"unknown escape sequence '\\%0'">;
  def err_hex_escape_no_digits : Error<"\\x used with no following hex digits">;
+def err_ucn_escape_no_digits : Error<"\\u used with no following hex digits">;
+def err_ucn_escape_invalid : Error<"invalid universal character">;
+def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
+def err_ucn_escape_too_big : Error<"universal character name is too long">;
  def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
  def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
  def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp

index c20383f0313312cc1a490ca711b4519cd44cc557..dcd239d5abd41235658536c90135656e3d50fb88 100644 (file)
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -71,8 +71,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
    case 'v':
      ResultChar = 11;
      break;
-    
-    //case 'u': case 'U':  // FIXME: UCNs.
    case 'x': { // Hex escape.
      ResultChar = 0;
      if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
@@ -151,7 +149,90 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
    return ResultChar;
  }
  
-
+/// ProcessUCNEscape - Read the Universal Character Name starting at ThisTokBuf
+/// (which points at the backslash), check constraints and convert the UTF32 to
+/// UTF8 at ResultBuf. ThisTokBuf and ResultBuf are advanced past what was read
+/// and written. On error a diagnostic is emitted at Loc, HadError is set and
+/// nothing is written. This is a subroutine of StringLiteralParser.
+/// When we decide to implement UCNs for character constants and identifiers,
+/// we will likely rework our support for UCNs.
+static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+                             char *&ResultBuf, const char *ResultBufEnd,
+                             bool &HadError,
+                             SourceLocation Loc, Preprocessor &PP) {
+  // FIXME: Add a warning - UCNs are only valid in C++ & C99.
+  // Skip the '\u' (or '\U') characters.
+  ThisTokBuf += 2;
+
+  if (ThisTokBuf == ThisTokEnd || !isxdigit((unsigned char)*ThisTokBuf)) {
+    PP.Diag(Loc, diag::err_ucn_escape_no_digits);
+    HadError = true;
+    return;
+  }
+  typedef unsigned int UTF32;
+  UTF32 UcnVal = 0;
+  unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); // 4 digits for u, 8 for U.
+  for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+    int CharVal = HexDigitValue(ThisTokBuf[0]);
+    if (CharVal == -1) break;
+    UcnVal <<= 4;
+    UcnVal |= CharVal;
+  }
+  // If we didn't consume the proper number of digits, there is a problem.
+  if (UcnLen) {
+    PP.Diag(Loc, diag::err_ucn_escape_incomplete);
+    HadError = true;
+    return;
+  }
+  // Check UCN constraints (C99 6.4.3p2). Also reject anything above 0x10FFFF,
+  // the maximum legal UTF32 value: the writer below emits at most 4 bytes.
+  if ((UcnVal < 0xa0 &&
+      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
+      || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF) || UcnVal > 0x10FFFF) {
+    PP.Diag(Loc, diag::err_ucn_escape_invalid);
+    HadError = true;
+    return;
+  }
+  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
+  // The conversion below was inspired by:
+  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
+  // First, we determine how many bytes the result will require.
+  typedef unsigned char UTF8;
+  unsigned short bytesToWrite = 0;
+  if (UcnVal < (UTF32)0x80)
+    bytesToWrite = 1;
+  else if (UcnVal < (UTF32)0x800)
+    bytesToWrite = 2;
+  else if (UcnVal < (UTF32)0x10000)
+    bytesToWrite = 3;
+  else
+    bytesToWrite = 4;
+  // If the buffer isn't big enough, bail.
+  if ((ResultBuf + bytesToWrite) >= ResultBufEnd) {
+    PP.Diag(Loc, diag::err_ucn_escape_too_big);
+    HadError = true;
+    return;
+  }
+  // (UcnVal | byteMark) & byteMask yields a continuation byte: 10xxxxxx.
+  const unsigned byteMask = 0xBF;
+  const unsigned byteMark = 0x80;
+  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
+  // into the first byte, depending on how many bytes follow.  There are
+  // as many entries in this table as there are UTF8 sequence types.
+  static const UTF8 firstByteMark[7] = {
+    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
+  };
+  // Finally, we write the bytes into ResultBuf, last byte first.
+  ResultBuf += bytesToWrite;
+  switch (bytesToWrite) { // note: everything falls through.
+    case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+    case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
+  }
+  // Update the buffer.
+  ResultBuf += bytesToWrite;
+}
  
  
  ///       integer-constant: [C99 6.4.4.1]
@@ -757,23 +838,29 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
              *ResultPtr++ = InStart[0];
              // Add zeros at the end.
              for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-            *ResultPtr++ = 0;
+              *ResultPtr++ = 0;
            }
          }
          continue;
        }
        
-      // Otherwise, this is an escape character.  Process it.
-      unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
-                                              StringToks[i].getLocation(),
-                                              ThisIsWide, PP);
-      
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultPtr++ = ResultChar & 0xFF;
-      
-      if (AnyWide) {
-        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-          *ResultPtr++ = ResultChar >> i*8;
+      if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+        ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, 
+                         GetString() + ResultBuf.size(),
+                         hadError, StringToks[i].getLocation(), PP);
+      } else {
+        // Otherwise, this is a non-UCN escape character.  Process it.
+        unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+                                                StringToks[i].getLocation(),
+                                                ThisIsWide, PP);
+        
+        // Note: our internal rep of wide char tokens is always little-endian.
+        *ResultPtr++ = ResultChar & 0xFF;
+        
+        if (AnyWide) {
+          for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+            *ResultPtr++ = ResultChar >> i*8;
+        }
        }
      }
    }
diff --git a/test/Sema/ucn-cstring.c b/test/Sema/ucn-cstring.c

new file mode 100644 (file)

index 0000000..ec760f4
--- /dev/null
+++ b/test/Sema/ucn-cstring.c
@@ -0,0 +1,15 @@
+// RUN: clang-cc %s -verify -fsyntax-only -pedantic
+
+#include <stdio.h>
+
+int main(void) {
+  printf("%s (%zu)\n", "hello \u2192 \u2603 \u2190 world", sizeof("hello \u2192 \u2603 \u2190 world"));
+  printf("%s (%zu)\n", "\U00010400\U0001D12B", sizeof("\U00010400\U0001D12B"));
+  // Some error conditions; each line below must produce the diagnostic it names.
+  printf("%s\n", "\U"); // expected-error{{\u used with no following hex digits}}
+  printf("%s\n", "\U00"); // expected-error{{incomplete universal character name}}
+  printf("%s\n", "\U0001"); // expected-error{{incomplete universal character name}}
+  printf("%s\n", "\u0001"); // expected-error{{invalid universal character}}
+  return 0;
+}
+
author	Steve Naroff <snaroff@apple.com>
	Mon, 30 Mar 2009 23:46:03 +0000 (23:46 +0000)
committer	Steve Naroff <snaroff@apple.com>
	Mon, 30 Mar 2009 23:46:03 +0000 (23:46 +0000)
include/clang/Basic/DiagnosticLexKinds.td		patch \| blob \| history
lib/Lex/LiteralSupport.cpp		patch \| blob \| history
test/Sema/ucn-cstring.c	[new file with mode: 0644]	patch \| blob