From: Richard Smith <richard-llvm@metafoo.co.uk>
Date: Sat, 8 Sep 2012 07:16:20 +0000 (+0000)
Subject: When a bad UTF-8 encoding or bogus escape sequence is encountered in a
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e5f0588840b20897631cc8110344fd2745ef4caa;p=clang

When a bad UTF-8 encoding or bogus escape sequence is encountered in a
string literal, produce a diagnostic pointing at the erroneous character
range, not at the start of the literal.


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@163459 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/clang/Basic/ConvertUTF.h b/include/clang/Basic/ConvertUTF.h
index 5be304b475..cdc42699e3 100644
--- a/include/clang/Basic/ConvertUTF.h
+++ b/include/clang/Basic/ConvertUTF.h
@@ -158,7 +158,9 @@ ConversionResult ConvertUTF32toUTF16 (
 
 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
 
-Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd);
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
+
+unsigned getNumBytesForUTF8(UTF8 firstByte);
 
 #ifdef __cplusplus
 }
@@ -174,11 +176,13 @@ namespace clang {
  * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
  * WideCharWidth. The converted data is written to ResultPtr, which needs to
  * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
- * ResultPtr will point one after the end of the copied string.
+ * ResultPtr will point one after the end of the copied string. On failure,
+ * ResultPtr will not be changed, and ErrorPtr will be set to the location of
+ * the first character which could not be converted.
  * \return true on success.
  */
 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
-                       char *&ResultPtr);
+                       char *&ResultPtr, const UTF8 *&ErrorPtr);
 
 /**
  * Convert an Unicode code point to UTF8 sequence.
diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h
index bbce62d1d4..fec158d7a8 100644
--- a/include/clang/Lex/LiteralSupport.h
+++ b/include/clang/Lex/LiteralSupport.h
@@ -230,8 +230,8 @@ public:
 
 private:
   void init(const Token *StringToks, unsigned NumStringToks);
-  bool CopyStringFragment(StringRef Fragment);
-  bool DiagnoseBadString(const Token& Tok);
+  bool CopyStringFragment(const Token &Tok, const char *TokBegin,
+                          StringRef Fragment);
   void DiagnoseLexingError(SourceLocation Loc);
 };
 
diff --git a/lib/Basic/ConvertUTF.c b/lib/Basic/ConvertUTF.c
index 2e25e79c4c..ec57be701a 100644
--- a/lib/Basic/ConvertUTF.c
+++ b/lib/Basic/ConvertUTF.c
@@ -392,16 +392,26 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
 
 /* --------------------------------------------------------------------- */
 
+/*
+ * Exported function to return the total number of bytes in a codepoint
+ * represented in UTF-8, given the value of the first byte.
+ */
+unsigned getNumBytesForUTF8(UTF8 first) {
+  return trailingBytesForUTF8[first] + 1;
+}
+
+/* --------------------------------------------------------------------- */
+
 /*
  * Exported function to return whether a UTF-8 string is legal or not.
  * This is not used here; it's just exported.
  */
-Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) {
-    while (source != sourceEnd) {
-        int length = trailingBytesForUTF8[*source] + 1;
-        if (length > sourceEnd - source || !isLegalUTF8(source, length))
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
+    while (*source != sourceEnd) {
+        int length = trailingBytesForUTF8[**source] + 1;
+        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
             return false;
-        source += length;
+        *source += length;
     }
     return true;
 }
diff --git a/lib/Basic/ConvertUTFWrapper.cpp b/lib/Basic/ConvertUTFWrapper.cpp
index a1b3f7fd9d..6be3828d28 100644
--- a/lib/Basic/ConvertUTFWrapper.cpp
+++ b/lib/Basic/ConvertUTFWrapper.cpp
@@ -13,16 +13,19 @@
 namespace clang {
 
 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
-                       char *&ResultPtr) {
+                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
   assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
   ConversionResult result = conversionOK;
   // Copy the character span over.
   if (WideCharWidth == 1) {
-    if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Source.begin()),
-                           reinterpret_cast<const UTF8*>(Source.end())))
+    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
+    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
       result = sourceIllegal;
-    memcpy(ResultPtr, Source.data(), Source.size());
-    ResultPtr += Source.size();
+      ErrorPtr = Pos;
+    } else {
+      memcpy(ResultPtr, Source.data(), Source.size());
+      ResultPtr += Source.size();
+    }
   } else if (WideCharWidth == 2) {
     const UTF8 *sourceStart = (const UTF8*)Source.data();
     // FIXME: Make the type of the result buffer correct instead of
@@ -34,6 +37,8 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
         &targetStart, targetStart + 2*Source.size(), flags);
     if (result == conversionOK)
       ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
   } else if (WideCharWidth == 4) {
     const UTF8 *sourceStart = (const UTF8*)Source.data();
     // FIXME: Make the type of the result buffer correct instead of
@@ -45,6 +50,8 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
         &targetStart, targetStart + 4*Source.size(), flags);
     if (result == conversionOK)
       ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
   }
   assert((result != targetExhausted)
          && "ConvertUTF8toUTFXX exhausted target buffer");
@@ -67,4 +74,3 @@ bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
 }
 
 } // end namespace clang
-
diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp
index 4fb81eb43c..9e172615f4 100644
--- a/lib/CodeGen/CGExpr.cpp
+++ b/lib/CodeGen/CGExpr.cpp
@@ -1851,8 +1851,9 @@ GetAddrOfConstantWideString(StringRef Str,
 static void ConvertUTF8ToWideString(unsigned CharByteWidth, StringRef Source,
                                     SmallString<32>& Target) {
   Target.resize(CharByteWidth * (Source.size() + 1));
-  char* ResultPtr = &Target[0];
-  bool success = ConvertUTF8toWide(CharByteWidth, Source, ResultPtr);
+  char *ResultPtr = &Target[0];
+  const UTF8 *ErrorPtr;
+  bool success = ConvertUTF8toWide(CharByteWidth, Source, ResultPtr, ErrorPtr);
   (void)success;
   assert(success);
   Target.resize(ResultPtr - &Target[0]);
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index 9e3c7786a7..be53429e6f 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -49,12 +49,35 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
   }
 }
 
+/// \brief Produce a diagnostic highlighting some portion of a literal.
+///
+/// Emits the diagnostic \p DiagID, highlighting the range of characters from
+/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
+/// a substring of a spelling buffer for the token beginning at \p TokBegin.
+static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
+                              const LangOptions &Features, FullSourceLoc TokLoc,
+                              const char *TokBegin, const char *TokRangeBegin,
+                              const char *TokRangeEnd, unsigned DiagID) {
+  SourceLocation Begin =
+    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
+                                   TokLoc.getManager(), Features);
+  SourceLocation End =
+    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
+                                   TokLoc.getManager(), Features);
+  return Diags->Report(Begin, DiagID)
+      << CharSourceRange::getCharRange(Begin, End);
+}
+
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
 /// either a character or a string literal.
-static unsigned ProcessCharEscape(const char *&ThisTokBuf,
+static unsigned ProcessCharEscape(const char *ThisTokBegin,
+                                  const char *&ThisTokBuf,
                                   const char *ThisTokEnd, bool &HadError,
                                   FullSourceLoc Loc, unsigned CharWidth,
-                                  DiagnosticsEngine *Diags) {
+                                  DiagnosticsEngine *Diags,
+                                  const LangOptions &Features) {
+  const char *EscapeBegin = ThisTokBuf;
+
   // Skip the '\' char.
   ++ThisTokBuf;
 
@@ -75,12 +98,14 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     break;
   case 'e':
     if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape) << "e";
     ResultChar = 27;
     break;
   case 'E':
     if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape) << "E";
     ResultChar = 27;
     break;
   case 'f':
@@ -102,7 +127,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     ResultChar = 0;
     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
       if (Diags)
-        Diags->Report(Loc, diag::err_hex_escape_no_digits);
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_hex_escape_no_digits);
       HadError = 1;
       break;
     }
@@ -126,7 +152,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
 
     // Check for overflow.
     if (Overflow && Diags)   // Too many digits to fit in
-      Diags->Report(Loc, diag::warn_hex_escape_too_large);
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::warn_hex_escape_too_large);
     break;
   }
   case '0': case '1': case '2': case '3':
@@ -148,7 +175,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     // Check for overflow.  Reject '\777', but not L'\777'.
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       if (Diags)
-        Diags->Report(Loc, diag::warn_octal_escape_too_large);
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::warn_octal_escape_too_large);
       ResultChar &= ~0U >> (32-CharWidth);
     }
     break;
@@ -158,19 +186,22 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
   case '(': case '{': case '[': case '%':
     // GCC accepts these as extensions.  We warn about them as such though.
     if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape)
-        << std::string()+(char)ResultChar;
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape)
+        << std::string(1, ResultChar);
     break;
   default:
     if (Diags == 0)
       break;
-      
+
     if (isgraph(ResultChar))
-      Diags->Report(Loc, diag::ext_unknown_escape)
-        << std::string()+(char)ResultChar;
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_unknown_escape)
+        << std::string(1, ResultChar);
     else
-      Diags->Report(Loc, diag::ext_unknown_escape)
-        << "x"+llvm::utohexstr(ResultChar);
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_unknown_escape)
+        << "x" + llvm::utohexstr(ResultChar);
     break;
   }
 
@@ -185,9 +216,6 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
                              FullSourceLoc Loc, DiagnosticsEngine *Diags, 
                              const LangOptions &Features,
                              bool in_char_string_literal = false) {
-  if (!Features.CPlusPlus && !Features.C99 && Diags)
-    Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
-
   const char *UcnBegin = ThisTokBuf;
 
   // Skip the '\u' char's.
@@ -195,7 +223,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 
   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
     if (Diags)
-      Diags->Report(Loc, diag::err_ucn_escape_no_digits);
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_no_digits);
     return false;
   }
   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
@@ -208,12 +237,9 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
   }
   // If we didn't consume the proper number of digits, there is a problem.
   if (UcnLenSave) {
-    if (Diags) {
-      SourceLocation L =
-        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
-                                       Loc.getManager(), Features);
-      Diags->Report(L, diag::err_ucn_escape_incomplete);
-    }
+    if (Diags)
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_incomplete);
     return false;
   }
 
@@ -221,7 +247,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
     if (Diags)
-      Diags->Report(Loc, diag::err_ucn_escape_invalid);
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_invalid);
     return false;
   }
 
@@ -231,22 +258,25 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
     bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
     if (Diags) {
-      SourceLocation UcnBeginLoc =
-        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
-                                       Loc.getManager(), Features);
       char BasicSCSChar = UcnVal;
       if (UcnVal >= 0x20 && UcnVal < 0x7f)
-        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
-                      diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
-          << StringRef(&BasicSCSChar, 1);
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             IsError ? diag::err_ucn_escape_basic_scs :
+                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
+            << StringRef(&BasicSCSChar, 1);
       else
-        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
-                      diag::warn_cxx98_compat_literal_ucn_control_character);
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             IsError ? diag::err_ucn_control_character :
+                       diag::warn_cxx98_compat_literal_ucn_control_character);
     }
     if (IsError)
       return false;
   }
 
+  if (!Features.CPlusPlus && !Features.C99 && Diags)
+    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+         diag::warn_ucn_not_valid_in_c89);
+
   return true;
 }
 
@@ -943,7 +973,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
         HadError = true;
       } else if (*buffer_begin > largest_character_for_kind) {
         HadError = true;
-        PP.Diag(Loc,diag::err_character_too_large);
+        PP.Diag(Loc, diag::err_character_too_large);
       }
 
       ++buffer_begin;
@@ -951,9 +981,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
     }
     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
     uint64_t result =
-    ProcessCharEscape(begin, end, HadError,
-                      FullSourceLoc(Loc,PP.getSourceManager()),
-                      CharWidth, &PP.getDiagnostics());
+      ProcessCharEscape(TokBegin, begin, end, HadError,
+                        FullSourceLoc(Loc,PP.getSourceManager()),
+                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
     *buffer_begin++ = result;
   }
 
@@ -1110,7 +1140,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
         Kind = StringToks[i].getKind();
       } else {
         if (Diags)
-          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+          Diags->Report(StringToks[i].getLocation(),
                         diag::err_unsupported_string_concat);
         hadError = true;
       }
@@ -1218,9 +1248,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
       assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
 
       // Copy the string over
-      if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
-        if (DiagnoseBadString(StringToks[i]))
-          hadError = true;
+      if (CopyStringFragment(StringToks[i], ThisTokBegin,
+                             StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
+        hadError = true;
     } else {
       if (ThisTokBuf[0] != '"') {
         // The file may have come from PCH and then changed after loading the
@@ -1251,9 +1281,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
 
           // Copy the character span over.
-          if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
-            if (DiagnoseBadString(StringToks[i]))
-              hadError = true;
+          if (CopyStringFragment(StringToks[i], ThisTokBegin,
+                                 StringRef(InStart, ThisTokBuf - InStart)))
+            hadError = true;
           continue;
         }
         // Is this a Universal Character Name escape?
@@ -1266,9 +1296,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
         }
         // Otherwise, this is a non-UCN escape character.  Process it.
         unsigned ResultChar =
-          ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                             FullSourceLoc(StringToks[i].getLocation(), SM),
-                            CharByteWidth*8, Diags);
+                            CharByteWidth*8, Diags, Features);
 
         if (CharByteWidth == 4) {
           // FIXME: Make the type of the result buffer correct instead of
@@ -1308,8 +1338,8 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 
     // Verify that pascal strings aren't too large.
     if (GetStringLength() > 256) {
-      if (Diags) 
-        Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+      if (Diags)
+        Diags->Report(StringToks[0].getLocation(),
                       diag::err_pascal_string_too_long)
           << SourceRange(StringToks[0].getLocation(),
                          StringToks[NumStringToks-1].getLocation());
@@ -1321,7 +1351,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
     
     if (GetNumStringChars() > MaxChars)
-      Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+      Diags->Report(StringToks[0].getLocation(),
                     diag::ext_string_too_long)
         << GetNumStringChars() << MaxChars
         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
@@ -1330,21 +1360,32 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
   }
 }
 
-/// copyStringFragment - This function copies from Start to End into ResultPtr.
+/// \brief This function copies from Fragment, which is a sequence of bytes
+/// within Tok's contents (which begin at TokBegin) into ResultPtr.
 /// Performs widening for multi-byte characters.
-bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
-  return !ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr);
-}
+bool StringLiteralParser::CopyStringFragment(const Token &Tok,
+                                             const char *TokBegin,
+                                             StringRef Fragment) {
+  const UTF8 *ErrorPtrTmp;
+  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
+    return false;
+  const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
 
-bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
   // If we see bad encoding for unprefixed string literals, warn and
   // simply copy the byte values, for compatibility with gcc and older
   // versions of clang.
   bool NoErrorOnBadEncoding = isAscii();
-  unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
-                                        diag::err_bad_string_encoding;
-  if (Diags)
-    Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
+  if (NoErrorOnBadEncoding) {
+    memcpy(ResultPtr, Fragment.data(), Fragment.size());
+    ResultPtr += Fragment.size();
+  }
+  if (Diags) {
+    Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
+         ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
+                                                 Fragment.end() - ErrorPtr),
+         NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
+                              : diag::err_bad_string_encoding);
+  }
   return !NoErrorOnBadEncoding;
 }
 
@@ -1422,9 +1463,9 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
       }
       ByteNo -= Len;
     } else {
-      ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
+      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
                         FullSourceLoc(Tok.getLocation(), SM),
-                        CharByteWidth*8, Diags);
+                        CharByteWidth*8, Diags, Features);
       --ByteNo;
     }
     assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/test/Lexer/string-literal-errors.cpp b/test/Lexer/string-literal-errors.cpp
new file mode 100644
index 0000000000..be574c2a55
--- /dev/null
+++ b/test/Lexer/string-literal-errors.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck -strict-whitespace %s
+
+void foo() {
+  (void)"\q \u123z \x \U \U123 \U12345 \u123 \xyzzy \777 \U"
+  // CHECK: {{^  \(void\)"\\q \\u123z \\x \\U \\U123 \\U12345 \\u123 \\xyzzy \\777 \\U"$}}
+  //
+  //              (void)"\q \u123z \x \U \U123 \U12345 \u123 \xyzzy \777 \U"
+  // CHECK: {{^         \^~$}}
+  // CHECK: {{^            \^~~~~$}}
+  // CHECK: {{^                   \^~$}}
+  // CHECK: {{^                      \^~$}}
+  // CHECK: {{^                         \^~~~~$}}
+  // CHECK: {{^                               \^~~~~~~$}}
+  // CHECK: {{^                                       \^~~~~$}}
+  // CHECK: {{^                                             \^~$}}
+  // CHECK: {{^                                                    \^~~~$}}
+  // CHECK: {{^                                                         \^~$}}
+
+  "123 \x \z";
+  // CHECK: {{^  "123 \\x \\z";$}}
+  //
+  //              "123 \x \z";
+  // CHECK: {{^       \^~$}}
+  // CHECK: {{^          \^~$}}
+}
diff --git a/test/Misc/wrong-encoding.c b/test/Misc/wrong-encoding.c
index bd1cf3dc02..476c783c24 100644
--- a/test/Misc/wrong-encoding.c
+++ b/test/Misc/wrong-encoding.c
@@ -4,12 +4,12 @@ void foo() {
 
   "§Ã"; // ø
 // CHECK: {{^  "<A7><C3>"; // <F8>}}
-// CHECK: {{^  \^}}
+// CHECK: {{^   \^~~~}}
 
   /* þ« */ const char *d = "¥";
 
 // CHECK: {{^  /\* <FE><AB> \*/ const char \*d = "<A5>";}}
-// CHECK: {{^                                 \^}}
+// CHECK: {{^                                  \^~~~}}
 
 // CHECK: {{^  "<A7><C3>"; // <F8>}}
 // CHECK: {{^  \^~~~~~~~~~}}