From: Daniel Dunbar Date: Thu, 23 Jul 2009 23:41:22 +0000 (+0000) Subject: Output UTF-16 string literals independent of host byte order. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=70ee975fad4653fa09f8e77f9a46a7b1f592ef59;p=clang Output UTF-16 string literals independent of host byte order. - Steve, can you take a look at this? It seems like this code should live elsewhere, and there is a FIXME about having Sema validate the UTF-8 to UTF-16 conversion. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@76915 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp index 994f60b015..803df31a6f 100644 --- a/lib/CodeGen/CodeGenModule.cpp +++ b/lib/CodeGen/CodeGenModule.cpp @@ -1193,6 +1193,7 @@ static void appendFieldAndPadding(CodeGenModule &CGM, static llvm::StringMapEntry & GetConstantCFStringEntry(llvm::StringMap &Map, const StringLiteral *Literal, + bool TargetIsLSB, bool &IsUTF16, unsigned &StringLength) { unsigned NumBytes = Literal->getByteLength(); @@ -1223,15 +1224,28 @@ GetConstantCFStringEntry(llvm::StringMap &Map, StringLength)); } - // FIXME: Storing UTF-16 in a C string is a hack to test Unicode strings - // without doing more surgery to this routine. Since we aren't explicitly - // checking for endianness here, it's also a bug (when generating code for - // a target that doesn't match the host endianness). Modeling this as an - // i16 array is likely the cleanest solution. + // ConvertUTF8toUTF16 returns the length in ToPtr. StringLength = ToPtr - &ToBuf[0]; + + // Render the UTF-16 string into a byte array and convert to the target byte + // order. + // + // FIXME: This isn't something we should need to do here. 
+ llvm::SmallString<128> AsBytes; + AsBytes.reserve(StringLength * 2); + for (unsigned i = 0; i != StringLength; ++i) { + unsigned short Val = ToBuf[i]; + if (TargetIsLSB) { + AsBytes.push_back(Val & 0xFF); + AsBytes.push_back(Val >> 8); + } else { + AsBytes.push_back(Val >> 8); + AsBytes.push_back(Val & 0xFF); + } + } + IsUTF16 = true; - return Map.GetOrCreateValue(llvm::StringRef((char *)&ToBuf[0], - StringLength * 2)); + return Map.GetOrCreateValue(llvm::StringRef(AsBytes.data(), AsBytes.size())); } llvm::Constant * @@ -1239,8 +1253,9 @@ CodeGenModule::GetAddrOfConstantCFString(const StringLiteral *Literal) { unsigned StringLength = 0; bool isUTF16 = false; llvm::StringMapEntry &Entry = - GetConstantCFStringEntry(CFConstantStringMap, Literal, isUTF16, - StringLength); + GetConstantCFStringEntry(CFConstantStringMap, Literal, + getTargetData().isLittleEndian(), + isUTF16, StringLength); if (llvm::Constant *C = Entry.getValue()) return C; diff --git a/test/CodeGen/darwin-string-literals.c b/test/CodeGen/darwin-string-literals.c index 90662d15e6..2f94d557ca 100644 --- a/test/CodeGen/darwin-string-literals.c +++ b/test/CodeGen/darwin-string-literals.c @@ -1,8 +1,14 @@ -// RUN: clang-cc -triple i386-apple-darwin9 -emit-llvm %s -o - | FileCheck %s +// RUN: clang-cc -triple i386-apple-darwin9 -emit-llvm %s -o - | FileCheck -check-prefix LSB %s -// CHECK: @.str = private constant [8 x i8] c"string0\00" -// CHECK: @.str1 = private constant [8 x i8] c"string1\00", section "__TEXT,__cstring,cstring_literals" -// CHECK: @__utf16_string_ = internal global [35 x i8] c"h\00e\00l\00l\00o\00 \00\92! \00\03& \00\90! \00w\00o\00r\00l\00d\00\00", section "__TEXT,__ustring", align 2 +// CHECK-LSB: @.str = private constant [8 x i8] c"string0\00" +// CHECK-LSB: @.str1 = private constant [8 x i8] c"string1\00", section "__TEXT,__cstring,cstring_literals" +// CHECK-LSB: @__utf16_string_ = internal global [35 x i8] c"h\00e\00l\00l\00o\00 \00\92! \00\03& \00\90! 
\00w\00o\00r\00l\00d\00\00", section "__TEXT,__ustring", align 2 + +// RUN: clang-cc -triple powerpc-apple-darwin9 -emit-llvm %s -o - | FileCheck -check-prefix MSB %s + +// CHECK-MSB: @.str = private constant [8 x i8] c"string0\00" +// CHECK-MSB: @.str1 = private constant [8 x i8] c"string1\00", section "__TEXT,__cstring,cstring_literals" +// CHECK-MSB: @__utf16_string_ = internal global [35 x i8] c"\00h\00e\00l\00l\00o\00 !\92\00 &\03\00 !\90\00 \00w\00o\00r\00l\00d\00", section "__TEXT,__ustring", align 2 const char *g0 = "string0"; const void *g1 = __builtin___CFStringMakeConstantString("string1");