From f74a4587629615ffd13bd0724868f86ba8c8f27b Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 1 Nov 2011 02:14:50 +0000 Subject: [PATCH] Perform proper conversion for strings encoded in the source file as UTF-8. (For now, we are assuming the source character set is always UTF-8; this can be easily extended if necessary.) Tests will be coming up in a subsequent commit. Patch by Seth Cantrell. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@143416 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/DiagnosticLexKinds.td | 2 + include/clang/Lex/LiteralSupport.h | 2 +- lib/Lex/LiteralSupport.cpp | 55 ++++++++++++++++++----- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index 5206fd2c27..f1e8c75d6b 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -130,6 +130,8 @@ def warn_cxx98_compat_unicode_literal : Warning< InGroup, DefaultIgnore; def err_unsupported_string_concat : Error< "unsupported non-standard concatenation of string literals">; +def err_bad_string_encoding : Error< + "illegal sequence in string literal">; //===----------------------------------------------------------------------===// // PTH Diagnostics diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h index b33092c753..6f378041ae 100644 --- a/include/clang/Lex/LiteralSupport.h +++ b/include/clang/Lex/LiteralSupport.h @@ -197,7 +197,7 @@ public: private: void init(const Token *StringToks, unsigned NumStringToks); - void CopyStringFragment(StringRef Fragment); + bool CopyStringFragment(StringRef Fragment); }; } // end namespace clang diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index 70183fd1a0..b107531e14 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -16,6 +16,7 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Lex/LexDiagnostic.h" 
#include "clang/Basic/TargetInfo.h" +#include "clang/Basic/ConvertUTF.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" using namespace clang; @@ -1033,7 +1034,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ ThisTokEnd -= (ThisTokBuf - Prefix); // Copy the string over - CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)); + if (CopyStringFragment(StringRef(ThisTokBuf,ThisTokEnd-ThisTokBuf))) + { + if (Diags) + Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), + diag::err_bad_string_encoding); + hadError = true; + } + } else { assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); ++ThisTokBuf; // skip " @@ -1060,7 +1068,13 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); // Copy the character span over. - CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)); + if (CopyStringFragment(StringRef(InStart,ThisTokBuf-InStart))) + { + if (Diags) + Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), + diag::err_bad_string_encoding); + hadError = true; + } continue; } // Is this a Universal Character Name escape? @@ -1116,20 +1130,39 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ /// copyStringFragment - This function copies from Start to End into ResultPtr. /// Performs widening for multi-byte characters. -void StringLiteralParser::CopyStringFragment(StringRef Fragment) { +bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { + assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4); + ConversionResult result = conversionOK; // Copy the character span over. if (CharByteWidth == 1) { memcpy(ResultPtr, Fragment.data(), Fragment.size()); ResultPtr += Fragment.size(); - } else { - // Note: our internal rep of wide char tokens is always little-endian. 
- for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) { - *ResultPtr++ = *I; - // Add zeros at the end. - for (unsigned i = 1, e = CharByteWidth; i != e; ++i) - *ResultPtr++ = 0; - } + } else if (CharByteWidth == 2) { + UTF8 const *sourceStart = (UTF8 const *)Fragment.data(); + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. + UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr); + ConversionFlags flags = lenientConversion; + result = ConvertUTF8toUTF16( + &sourceStart,sourceStart + Fragment.size(), + &targetStart,targetStart + 2*Fragment.size(),flags); + if (result==conversionOK) + ResultPtr = reinterpret_cast<char*>(targetStart); + } else if (CharByteWidth == 4) { + UTF8 const *sourceStart = (UTF8 const *)Fragment.data(); + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. + UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr); + ConversionFlags flags = lenientConversion; + result = ConvertUTF8toUTF32( + &sourceStart,sourceStart + Fragment.size(), + &targetStart,targetStart + 4*Fragment.size(),flags); + if (result==conversionOK) + ResultPtr = reinterpret_cast<char*>(targetStart); } + assert((result != targetExhausted) + && "ConvertUTF8toUTFXX exhausted target buffer"); + return result != conversionOK; } -- 2.40.0