When a bad UTF-8 encoding or bogus escape sequence is encountered in a

author Richard Smith <richard-llvm@metafoo.co.uk>

Sat, 8 Sep 2012 07:16:20 +0000 (07:16 +0000)

committer Richard Smith <richard-llvm@metafoo.co.uk>

Sat, 8 Sep 2012 07:16:20 +0000 (07:16 +0000)
author Richard Smith <richard-llvm@metafoo.co.uk>
Sat, 8 Sep 2012 07:16:20 +0000 (07:16 +0000)
committer Richard Smith <richard-llvm@metafoo.co.uk>
Sat, 8 Sep 2012 07:16:20 +0000 (07:16 +0000)
diff --git a/include/clang/Basic/ConvertUTF.h b/include/clang/Basic/ConvertUTF.h

index 5be304b475b3421adb4c0d00aea2ad99f044f07d..cdc42699e3eb212c1f364121841aa53d787c1dac 100644 (file)
--- a/include/clang/Basic/ConvertUTF.h
+++ b/include/clang/Basic/ConvertUTF.h
@@ -158,7 +158,9 @@ ConversionResult ConvertUTF32toUTF16 (
  
  Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
  
-Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd);
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
+
+unsigned getNumBytesForUTF8(UTF8 firstByte);
  
  #ifdef __cplusplus
  }
@@ -174,11 +176,13 @@ namespace clang {
   * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
   * WideCharWidth. The converted data is written to ResultPtr, which needs to
   * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
- * ResultPtr will point one after the end of the copied string.
+ * ResultPtr will point one after the end of the copied string. On failure,
+ * ResultPtr will not be changed, and ErrorPtr will be set to the location of
+ * the first character which could not be converted.
   * \return true on success.
   */
  bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
-                       char *&ResultPtr);
+                       char *&ResultPtr, const UTF8 *&ErrorPtr);
  
  /**
   * Convert an Unicode code point to UTF8 sequence.
diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h

index bbce62d1d48a345121dfbc7e7fb0246b8ceb12d8..fec158d7a8983291cec6d3954eff96f652a177d0 100644 (file)
--- a/include/clang/Lex/LiteralSupport.h
+++ b/include/clang/Lex/LiteralSupport.h
@@ -230,8 +230,8 @@ public:
  
  private:
    void init(const Token *StringToks, unsigned NumStringToks);
-  bool CopyStringFragment(StringRef Fragment);
-  bool DiagnoseBadString(const Token& Tok);
+  bool CopyStringFragment(const Token &Tok, const char *TokBegin,
+                          StringRef Fragment);
    void DiagnoseLexingError(SourceLocation Loc);
  };
  
diff --git a/lib/Basic/ConvertUTF.c b/lib/Basic/ConvertUTF.c

index 2e25e79c4c6e14dfef1fb75344bbe0c33b7d1f66..ec57be701add9f8e8e0e0218ca64ab9650f443f3 100644 (file)
--- a/lib/Basic/ConvertUTF.c
+++ b/lib/Basic/ConvertUTF.c
@@ -392,16 +392,26 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
  
  /* --------------------------------------------------------------------- */
  
+/*
+ * Exported function to return the total number of bytes in a codepoint
+ * represented in UTF-8, given the value of the first byte.
+ */
+unsigned getNumBytesForUTF8(UTF8 first) {
+  return trailingBytesForUTF8[first] + 1;
+}
+
+/* --------------------------------------------------------------------- */
+
  /*
   * Exported function to return whether a UTF-8 string is legal or not.
   * This is not used here; it's just exported.
   */
-Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) {
-    while (source != sourceEnd) {
-        int length = trailingBytesForUTF8[*source] + 1;
-        if (length > sourceEnd - source || !isLegalUTF8(source, length))
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
+    while (*source != sourceEnd) {
+        int length = trailingBytesForUTF8[**source] + 1;
+        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
              return false;
-        source += length;
+        *source += length;
      }
      return true;
  }
diff --git a/lib/Basic/ConvertUTFWrapper.cpp b/lib/Basic/ConvertUTFWrapper.cpp

index a1b3f7fd9da36c0669375962c10b81630909de9f..6be3828d2868a20f0e8eda5361c575a3b7dd7c38 100644 (file)
--- a/lib/Basic/ConvertUTFWrapper.cpp
+++ b/lib/Basic/ConvertUTFWrapper.cpp
@@ -13,16 +13,19 @@
  namespace clang {
  
  bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
-                       char *&ResultPtr) {
+                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
    assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
    ConversionResult result = conversionOK;
    // Copy the character span over.
    if (WideCharWidth == 1) {
-    if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Source.begin()),
-                           reinterpret_cast<const UTF8*>(Source.end())))
+    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
+    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
        result = sourceIllegal;
-    memcpy(ResultPtr, Source.data(), Source.size());
-    ResultPtr += Source.size();
+      ErrorPtr = Pos;
+    } else {
+      memcpy(ResultPtr, Source.data(), Source.size());
+      ResultPtr += Source.size();
+    }
    } else if (WideCharWidth == 2) {
      const UTF8 *sourceStart = (const UTF8*)Source.data();
      // FIXME: Make the type of the result buffer correct instead of
@@ -34,6 +37,8 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
          &targetStart, targetStart + 2*Source.size(), flags);
      if (result == conversionOK)
        ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
    } else if (WideCharWidth == 4) {
      const UTF8 *sourceStart = (const UTF8*)Source.data();
      // FIXME: Make the type of the result buffer correct instead of
@@ -45,6 +50,8 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
          &targetStart, targetStart + 4*Source.size(), flags);
      if (result == conversionOK)
        ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
    }
    assert((result != targetExhausted)
           && "ConvertUTF8toUTFXX exhausted target buffer");
@@ -67,4 +74,3 @@ bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
  }
  
  } // end namespace clang
-
diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp

index 4fb81eb43ce6a847e32cd5a408f9d6e6437fa052..9e172615f4ff67f1d1bc560b7efe19e17a0dea70 100644 (file)
--- a/lib/CodeGen/CGExpr.cpp
+++ b/lib/CodeGen/CGExpr.cpp
@@ -1851,8 +1851,9 @@ GetAddrOfConstantWideString(StringRef Str,
  static void ConvertUTF8ToWideString(unsigned CharByteWidth, StringRef Source,
                                      SmallString<32>& Target) {
    Target.resize(CharByteWidth * (Source.size() + 1));
-  char* ResultPtr = &Target[0];
-  bool success = ConvertUTF8toWide(CharByteWidth, Source, ResultPtr);
+  char *ResultPtr = &Target[0];
+  const UTF8 *ErrorPtr;
+  bool success = ConvertUTF8toWide(CharByteWidth, Source, ResultPtr, ErrorPtr);
    (void)success;
    assert(success);
    Target.resize(ResultPtr - &Target[0]);
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp

index 9e3c7786a7323df02a102c549eeeaebc5da481b1..be53429e6f2d8da5f37875dcf02d74c00aaaa199 100644 (file)
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -49,12 +49,35 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
    }
  }
  
+/// \brief Produce a diagnostic highlighting some portion of a literal.
+///
+/// Emits the diagnostic \p DiagID, highlighting the range of characters from
+/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
+/// a substring of a spelling buffer for the token beginning at \p TokBegin.
+static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
+                              const LangOptions &Features, FullSourceLoc TokLoc,
+                              const char *TokBegin, const char *TokRangeBegin,
+                              const char *TokRangeEnd, unsigned DiagID) {
+  SourceLocation Begin =
+    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
+                                   TokLoc.getManager(), Features);
+  SourceLocation End =
+    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
+                                   TokLoc.getManager(), Features);
+  return Diags->Report(Begin, DiagID)
+      << CharSourceRange::getCharRange(Begin, End);
+}
+
  /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  /// either a character or a string literal.
-static unsigned ProcessCharEscape(const char *&ThisTokBuf,
+static unsigned ProcessCharEscape(const char *ThisTokBegin,
+                                  const char *&ThisTokBuf,
                                    const char *ThisTokEnd, bool &HadError,
                                    FullSourceLoc Loc, unsigned CharWidth,
-                                  DiagnosticsEngine *Diags) {
+                                  DiagnosticsEngine *Diags,
+                                  const LangOptions &Features) {
+  const char *EscapeBegin = ThisTokBuf;
+
    // Skip the '\' char.
    ++ThisTokBuf;
  
@@ -75,12 +98,14 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
      break;
    case 'e':
      if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape) << "e";
      ResultChar = 27;
      break;
    case 'E':
      if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape) << "E";
      ResultChar = 27;
      break;
    case 'f':
@@ -102,7 +127,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
      ResultChar = 0;
      if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
        if (Diags)
-        Diags->Report(Loc, diag::err_hex_escape_no_digits);
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_hex_escape_no_digits);
        HadError = 1;
        break;
      }
@@ -126,7 +152,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  
      // Check for overflow.
      if (Overflow && Diags)   // Too many digits to fit in
-      Diags->Report(Loc, diag::warn_hex_escape_too_large);
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::warn_hex_escape_too_large);
      break;
    }
    case '0': case '1': case '2': case '3':
@@ -148,7 +175,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
      // Check for overflow.  Reject '\777', but not L'\777'.
      if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
        if (Diags)
-        Diags->Report(Loc, diag::warn_octal_escape_too_large);
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::warn_octal_escape_too_large);
        ResultChar &= ~0U >> (32-CharWidth);
      }
      break;
@@ -158,19 +186,22 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
    case '(': case '{': case '[': case '%':
      // GCC accepts these as extensions.  We warn about them as such though.
      if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape)
-        << std::string()+(char)ResultChar;
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape)
+        << std::string(1, ResultChar);
      break;
    default:
      if (Diags == 0)
        break;
-      
+
      if (isgraph(ResultChar))
-      Diags->Report(Loc, diag::ext_unknown_escape)
-        << std::string()+(char)ResultChar;
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_unknown_escape)
+        << std::string(1, ResultChar);
      else
-      Diags->Report(Loc, diag::ext_unknown_escape)
-        << "x"+llvm::utohexstr(ResultChar);
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_unknown_escape)
+        << "x" + llvm::utohexstr(ResultChar);
      break;
    }
  
@@ -185,9 +216,6 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
                               FullSourceLoc Loc, DiagnosticsEngine *Diags, 
                               const LangOptions &Features,
                               bool in_char_string_literal = false) {
-  if (!Features.CPlusPlus && !Features.C99 && Diags)
-    Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
-
    const char *UcnBegin = ThisTokBuf;
  
    // Skip the '\u' char's.
@@ -195,7 +223,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  
    if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
      if (Diags)
-      Diags->Report(Loc, diag::err_ucn_escape_no_digits);
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_no_digits);
      return false;
    }
    UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
@@ -208,12 +237,9 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    }
    // If we didn't consume the proper number of digits, there is a problem.
    if (UcnLenSave) {
-    if (Diags) {
-      SourceLocation L =
-        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
-                                       Loc.getManager(), Features);
-      Diags->Report(L, diag::err_ucn_escape_incomplete);
-    }
+    if (Diags)
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_incomplete);
      return false;
    }
  
@@ -221,7 +247,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
        UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
      if (Diags)
-      Diags->Report(Loc, diag::err_ucn_escape_invalid);
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_invalid);
      return false;
    }
  
@@ -231,22 +258,25 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
        (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
      bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
      if (Diags) {
-      SourceLocation UcnBeginLoc =
-        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
-                                       Loc.getManager(), Features);
        char BasicSCSChar = UcnVal;
        if (UcnVal >= 0x20 && UcnVal < 0x7f)
-        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
-                      diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
-          << StringRef(&BasicSCSChar, 1);
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             IsError ? diag::err_ucn_escape_basic_scs :
+                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
+            << StringRef(&BasicSCSChar, 1);
        else
-        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
-                      diag::warn_cxx98_compat_literal_ucn_control_character);
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             IsError ? diag::err_ucn_control_character :
+                       diag::warn_cxx98_compat_literal_ucn_control_character);
      }
      if (IsError)
        return false;
    }
  
+  if (!Features.CPlusPlus && !Features.C99 && Diags)
+    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+         diag::warn_ucn_not_valid_in_c89);
+
    return true;
  }
  
@@ -943,7 +973,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
          HadError = true;
        } else if (*buffer_begin > largest_character_for_kind) {
          HadError = true;
-        PP.Diag(Loc,diag::err_character_too_large);
+        PP.Diag(Loc, diag::err_character_too_large);
        }
  
        ++buffer_begin;
@@ -951,9 +981,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
      }
      unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
      uint64_t result =
-    ProcessCharEscape(begin, end, HadError,
-                      FullSourceLoc(Loc,PP.getSourceManager()),
-                      CharWidth, &PP.getDiagnostics());
+      ProcessCharEscape(TokBegin, begin, end, HadError,
+                        FullSourceLoc(Loc,PP.getSourceManager()),
+                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
      *buffer_begin++ = result;
    }
  
@@ -1110,7 +1140,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
          Kind = StringToks[i].getKind();
        } else {
          if (Diags)
-          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+          Diags->Report(StringToks[i].getLocation(),
                          diag::err_unsupported_string_concat);
          hadError = true;
        }
@@ -1218,9 +1248,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
        assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
  
        // Copy the string over
-      if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
-        if (DiagnoseBadString(StringToks[i]))
-          hadError = true;
+      if (CopyStringFragment(StringToks[i], ThisTokBegin,
+                             StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
+        hadError = true;
      } else {
        if (ThisTokBuf[0] != '"') {
          // The file may have come from PCH and then changed after loading the
@@ -1251,9 +1281,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
            } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
  
            // Copy the character span over.
-          if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
-            if (DiagnoseBadString(StringToks[i]))
-              hadError = true;
+          if (CopyStringFragment(StringToks[i], ThisTokBegin,
+                                 StringRef(InStart, ThisTokBuf - InStart)))
+            hadError = true;
            continue;
          }
          // Is this a Universal Character Name escape?
@@ -1266,9 +1296,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
          }
          // Otherwise, this is a non-UCN escape character.  Process it.
          unsigned ResultChar =
-          ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                              FullSourceLoc(StringToks[i].getLocation(), SM),
-                            CharByteWidth*8, Diags);
+                            CharByteWidth*8, Diags, Features);
  
          if (CharByteWidth == 4) {
            // FIXME: Make the type of the result buffer correct instead of
@@ -1308,8 +1338,8 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
  
      // Verify that pascal strings aren't too large.
      if (GetStringLength() > 256) {
-      if (Diags) 
-        Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+      if (Diags)
+        Diags->Report(StringToks[0].getLocation(),
                        diag::err_pascal_string_too_long)
            << SourceRange(StringToks[0].getLocation(),
                           StringToks[NumStringToks-1].getLocation());
@@ -1321,7 +1351,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
      unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
      
      if (GetNumStringChars() > MaxChars)
-      Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+      Diags->Report(StringToks[0].getLocation(),
                      diag::ext_string_too_long)
          << GetNumStringChars() << MaxChars
          << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
@@ -1330,21 +1360,32 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
    }
  }
  
-/// copyStringFragment - This function copies from Start to End into ResultPtr.
+/// \brief This function copies from Fragment, which is a sequence of bytes
+/// within Tok's contents (which begin at TokBegin) into ResultPtr.
  /// Performs widening for multi-byte characters.
-bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
-  return !ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr);
-}
+bool StringLiteralParser::CopyStringFragment(const Token &Tok,
+                                             const char *TokBegin,
+                                             StringRef Fragment) {
+  const UTF8 *ErrorPtrTmp;
+  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
+    return false;
+  const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
  
-bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
    // If we see bad encoding for unprefixed string literals, warn and
    // simply copy the byte values, for compatibility with gcc and older
    // versions of clang.
    bool NoErrorOnBadEncoding = isAscii();
-  unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
-                                        diag::err_bad_string_encoding;
-  if (Diags)
-    Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
+  if (NoErrorOnBadEncoding) {
+    memcpy(ResultPtr, Fragment.data(), Fragment.size());
+    ResultPtr += Fragment.size();
+  }
+  if (Diags) {
+    Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
+         ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
+                                                 Fragment.end() - ErrorPtr),
+         NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
+                              : diag::err_bad_string_encoding);
+  }
    return !NoErrorOnBadEncoding;
  }
  
@@ -1422,9 +1463,9 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
        }
        ByteNo -= Len;
      } else {
-      ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
+      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
                          FullSourceLoc(Tok.getLocation(), SM),
-                        CharByteWidth*8, Diags);
+                        CharByteWidth*8, Diags, Features);
        --ByteNo;
      }
      assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/test/Lexer/string-literal-errors.cpp b/test/Lexer/string-literal-errors.cpp

new file mode 100644 (file)

index 0000000..be574c2
--- /dev/null
+++ b/test/Lexer/string-literal-errors.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck -strict-whitespace %s
+
+void foo() {
+  (void)"\q \u123z \x \U \U123 \U12345 \u123 \xyzzy \777 \U"
+  // CHECK: {{^  \(void\)"\\q \\u123z \\x \\U \\U123 \\U12345 \\u123 \\xyzzy \\777 \\U"$}}
+  //
+  //              (void)"\q \u123z \x \U \U123 \U12345 \u123 \xyzzy \777 \U"
+  // CHECK: {{^         \^~$}}
+  // CHECK: {{^            \^~~~~$}}
+  // CHECK: {{^                   \^~$}}
+  // CHECK: {{^                      \^~$}}
+  // CHECK: {{^                         \^~~~~$}}
+  // CHECK: {{^                               \^~~~~~~$}}
+  // CHECK: {{^                                       \^~~~~$}}
+  // CHECK: {{^                                             \^~$}}
+  // CHECK: {{^                                                    \^~~~$}}
+  // CHECK: {{^                                                         \^~$}}
+
+  "123 \x \z";
+  // CHECK: {{^  "123 \\x \\z";$}}
+  //
+  //              "123 \x \z";
+  // CHECK: {{^       \^~$}}
+  // CHECK: {{^          \^~$}}
+}
diff --git a/test/Misc/wrong-encoding.c b/test/Misc/wrong-encoding.c

index bd1cf3dc02aec239b3807b3e0af3b2656708390a..476c783c24eba0ab4cc82da728a916895f48bfc5 100644 (file)
--- a/test/Misc/wrong-encoding.c
+++ b/test/Misc/wrong-encoding.c
@@ -4,12 +4,12 @@ void foo() {
  
    "§Ã"; // ø
  // CHECK: {{^  "<A7><C3>"; // <F8>}}
-// CHECK: {{^  \^}}
+// CHECK: {{^   \^~~~}}
  
    /* þ« */ const char *d = "¥";
  
  // CHECK: {{^  /\* <FE><AB> \*/ const char \*d = "<A5>";}}
-// CHECK: {{^                                 \^}}
+// CHECK: {{^                                  \^~~~}}
  
  // CHECK: {{^  "<A7><C3>"; // <F8>}}
  // CHECK: {{^  \^~~~~~~~~~}}
author	Richard Smith <richard-llvm@metafoo.co.uk>
	Sat, 8 Sep 2012 07:16:20 +0000 (07:16 +0000)
committer	Richard Smith <richard-llvm@metafoo.co.uk>
	Sat, 8 Sep 2012 07:16:20 +0000 (07:16 +0000)
include/clang/Basic/ConvertUTF.h		patch \| blob \| history
include/clang/Lex/LiteralSupport.h		patch \| blob \| history
lib/Basic/ConvertUTF.c		patch \| blob \| history
lib/Basic/ConvertUTFWrapper.cpp		patch \| blob \| history
lib/CodeGen/CGExpr.cpp		patch \| blob \| history
lib/Lex/LiteralSupport.cpp		patch \| blob \| history
test/Lexer/string-literal-errors.cpp	[new file with mode: 0644]	patch \| blob
test/Misc/wrong-encoding.c		patch \| blob \| history