Improves support for Unicode in character literals

author Seth Cantrell <seth.cantrell@gmail.com>

Wed, 18 Jan 2012 12:27:04 +0000 (12:27 +0000)

committer Seth Cantrell <seth.cantrell@gmail.com>

Wed, 18 Jan 2012 12:27:04 +0000 (12:27 +0000)
author Seth Cantrell <seth.cantrell@gmail.com>
Wed, 18 Jan 2012 12:27:04 +0000 (12:27 +0000)
committer Seth Cantrell <seth.cantrell@gmail.com>
Wed, 18 Jan 2012 12:27:04 +0000 (12:27 +0000)
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td

index f4d867d4fe49a0c352934d8018b4007f7f8ccc8e..547f2722c04722f8beb033b3352841f2c86827c1 100644 (file)
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -107,6 +107,8 @@ def warn_extraneous_char_constant : Warning<
    "extraneous characters in character constant ignored">;
  def warn_char_constant_too_large : Warning<
    "character constant too long for its type">;
+def err_multichar_utf_character_literal : Error<
+  "Unicode character literals may not contain multiple characters">;
  def err_exponent_has_no_digits : Error<"exponent has no digits">;
  def ext_imaginary_constant : Extension<"imaginary constants are an extension">;
  def err_hexconstant_requires_exponent : Error<
@@ -121,8 +123,8 @@ def warn_hex_escape_too_large : ExtWarn<"hex escape sequence out of range">;
  def ext_string_too_long : Extension<"string literal of length %0 exceeds "
    "maximum length %1 that %select{C90|ISO C99|C++}2 compilers are required to "
    "support">, InGroup<OverlengthStrings>;
-def warn_ucn_escape_too_large : ExtWarn<
-  "character unicode escape sequence too long for its type">, InGroup<Unicode>;
+def err_character_too_large : Error<
+  "character too large for enclosing character literal type">;
  def warn_ucn_not_valid_in_c89 : ExtWarn<
    "unicode escape sequences are only valid in C99 or C++">, InGroup<Unicode>;
  def warn_cxx98_compat_unicode_literal : Warning<
@@ -132,6 +134,8 @@ def err_unsupported_string_concat : Error<
    "unsupported non-standard concatenation of string literals">;
  def err_bad_string_encoding : Error<
    "illegal sequence in string literal">;
+def err_bad_character_encoding : Error<
+  "illegal sequence in character literal">;
    
  //===----------------------------------------------------------------------===//
  // PTH Diagnostics
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp

index 296a89461f8263855ead4dc40e8ebc3930390bea..8265c6fde47ebd6ec7cb9d7dbca9745e7b794a3a 100644 (file)
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -182,7 +182,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                               uint32_t &UcnVal, unsigned short &UcnLen,
                               FullSourceLoc Loc, DiagnosticsEngine *Diags, 
-                             const LangOptions &Features) {
+                             const LangOptions &Features,
+                             bool in_char_string_literal = false) {
    if (!Features.CPlusPlus && !Features.C99 && Diags)
      Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
  
@@ -216,11 +217,20 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
      }
      return false;
    }
-  // Check UCN constraints (C99 6.4.3p2).
-  if ((UcnVal < 0xa0 &&
-      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
-      || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
-      || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
+  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
+  bool invalid_ucn = (0xD800<=UcnVal && UcnVal<=0xDFFF) // surrogate codepoints
+                       || 0x10FFFF < UcnVal; // maximum legal UTF32 value
+
+  // C++11 allows UCNs that refer to control characters and basic source
+  // characters inside character and string literals
+  if (!Features.CPlusPlus0x || !in_char_string_literal) {
+    if ((UcnVal < 0xa0 &&
+         (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 ))) {  // $, @, `
+      invalid_ucn = true;
+    }
+  }
+
+  if (invalid_ucn) {
      if (Diags)
        Diags->Report(Loc, diag::err_ucn_escape_invalid);
      return false;
@@ -747,14 +757,13 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                       SourceLocation Loc, Preprocessor &PP,
                                       tok::TokenKind kind) {
-  // At this point we know that the character matches the regex "L?'.*'".
+  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
    HadError = false;
  
    Kind = kind;
  
-  // Determine if this is a wide or UTF character.
-  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
-      Kind == tok::utf32_char_constant) {
+  // Skip over the literal's encoding prefix (L, u, or U), if present.
+  if (Kind != tok::char_constant) {
      ++begin;
    }
  
@@ -762,6 +771,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    assert(begin[0] == '\'' && "Invalid token lexed");
    ++begin;
  
+  // Trim the ending quote.
+  assert(end[-1] == '\'' && "Invalid token lexed");
+  --end;
+
    // FIXME: The "Value" is an uint64_t so we can handle char literals of
    // up to 64-bits.
    // FIXME: This extensively assumes that 'char' is 8-bits.
@@ -773,76 +786,114 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
           "Assumes sizeof(wchar) on target is <= 64");
  
-  // This is what we will use for overflow detection
-  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
-
-  unsigned NumCharsSoFar = 0;
-  bool Warned = false;
-  while (begin[0] != '\'') {
-    uint64_t ResultChar;
-
-      // Is this a Universal Character Name escape?
-    if (begin[0] != '\\')     // If this is a normal character, consume it.
-      ResultChar = (unsigned char)*begin++;
-    else {                    // Otherwise, this is an escape character.
-      unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
-      // Check for UCN.
-      if (begin[1] == 'u' || begin[1] == 'U') {
-        uint32_t utf32 = 0;
-        unsigned short UcnLen = 0;
-        if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
-                              FullSourceLoc(Loc, PP.getSourceManager()),
-                              &PP.getDiagnostics(), PP.getLangOptions())) {
-          HadError = 1;
-        }
-        ResultChar = utf32;
-        if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
-          PP.Diag(Loc, diag::warn_ucn_escape_too_large);
-          ResultChar &= ~0U >> (32-CharWidth);
-        }
-      } else {
-        // Otherwise, this is a non-UCN escape character.  Process it.
-        ResultChar = ProcessCharEscape(begin, end, HadError,
-                                       FullSourceLoc(Loc,PP.getSourceManager()),
-                                       CharWidth, &PP.getDiagnostics());
-      }
-    }
+  SmallVector<uint32_t,4> codepoint_buffer;
+  codepoint_buffer.resize(end-begin);
+  uint32_t *buffer_begin = &codepoint_buffer.front();
+  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
+
+  // Unicode escapes representing characters that cannot be correctly
+  // represented in a single code unit are disallowed in character literals
+  // by this implementation.
+  uint32_t largest_character_for_kind;
+  if (tok::wide_char_constant == Kind) {
+    largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
+  } else if (tok::utf16_char_constant == Kind) {
+    largest_character_for_kind = 0xFFFF;
+  } else if (tok::utf32_char_constant == Kind) {
+    largest_character_for_kind = 0x10FFFF;
+  } else {
+    largest_character_for_kind = 0x7Fu;
+  }
  
-    // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
-    // implementation defined (C99 6.4.4.4p10).
-    if (NumCharsSoFar) {
-      if (!isAscii()) {
-        // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
-        LitVal = 0;
+  while (begin!=end) {
+    // Is this a span of non-escape characters?
+    if (begin[0] != '\\') {
+      char const *start = begin;
+      do {
+        ++begin;
+      } while (begin != end && *begin != '\\');
+
+      uint32_t *tmp_begin = buffer_begin;
+      ConversionResult res =
+      ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
+                         reinterpret_cast<UTF8 const *>(begin),
+                         &buffer_begin,buffer_end,strictConversion);
+      if (res!=conversionOK) {
+        PP.Diag(Loc, diag::err_bad_character_encoding);
+        HadError = true;
        } else {
-        // Narrow character literals act as though their value is concatenated
-        // in this implementation, but warn on overflow.
-        if (LitVal.countLeadingZeros() < 8 && !Warned) {
-          PP.Diag(Loc, diag::warn_char_constant_too_large);
-          Warned = true;
+        for (; tmp_begin<buffer_begin; ++tmp_begin) {
+          if (*tmp_begin > largest_character_for_kind) {
+            HadError = true;
+            PP.Diag(Loc, diag::err_character_too_large);
+          }
          }
-        LitVal <<= 8;
        }
+
+      continue;
      }
+    // Is this a Universal Character Name escape?
+    if (begin[1] == 'u' || begin[1] == 'U') {
+      unsigned short UcnLen = 0;
+      if (!ProcessUCNEscape(begin, end, *buffer_begin, UcnLen,
+                            FullSourceLoc(Loc, PP.getSourceManager()),
+                            &PP.getDiagnostics(), PP.getLangOptions(),
+                            true))
+      {
+        HadError = true;
+      } else if (*buffer_begin > largest_character_for_kind) {
+        HadError = true;
+        PP.Diag(Loc,diag::err_character_too_large);
+      }
  
-    LitVal = LitVal + ResultChar;
-    ++NumCharsSoFar;
+      ++buffer_begin;
+      continue;
+    }
+    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
+    uint64_t result =
+    ProcessCharEscape(begin, end, HadError,
+                      FullSourceLoc(Loc,PP.getSourceManager()),
+                      CharWidth, &PP.getDiagnostics());
+    *buffer_begin++ = result;
    }
  
-  // If this is the second character being processed, do special handling.
+  unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
+
    if (NumCharsSoFar > 1) {
-    // Warn about discarding the top bits for multi-char wide-character
-    // constants (L'abcd').
-    if (!isAscii())
+    if (isWide())
        PP.Diag(Loc, diag::warn_extraneous_char_constant);
-    else if (NumCharsSoFar != 4)
+    else if (isAscii() && NumCharsSoFar == 4)
+      PP.Diag(Loc, diag::ext_four_char_character_literal);
+    else if (isAscii())
        PP.Diag(Loc, diag::ext_multichar_character_literal);
      else
-      PP.Diag(Loc, diag::ext_four_char_character_literal);
+      PP.Diag(Loc, diag::err_multichar_utf_character_literal);
      IsMultiChar = true;
    } else
      IsMultiChar = false;
  
+  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
+
+  // Narrow character literals act as though their value is concatenated
+  // in this implementation, but warn on overflow.
+  bool multi_char_too_long = false;
+  if (isAscii() && isMultiChar()) {
+    LitVal = 0;
+    for (size_t i=0;i<NumCharsSoFar;++i) {
+      // check for enough leading zeros to shift into
+      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
+      LitVal <<= 8;
+      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
+    }
+  } else if (NumCharsSoFar > 0) {
+    // otherwise just take the last character
+    LitVal = buffer_begin[-1];
+  }
+
+  if (!HadError && multi_char_too_long) {
+    PP.Diag(Loc,diag::warn_char_constant_too_large);
+  }
+
    // Transfer the value from APInt to uint64_t
    Value = LitVal.getZExtValue();
author	Seth Cantrell <seth.cantrell@gmail.com>
	Wed, 18 Jan 2012 12:27:04 +0000 (12:27 +0000)
committer	Seth Cantrell <seth.cantrell@gmail.com>
	Wed, 18 Jan 2012 12:27:04 +0000 (12:27 +0000)
include/clang/Basic/DiagnosticLexKinds.td		patch \| blob \| history
lib/Lex/LiteralSupport.cpp		patch \| blob \| history