From ce70016434ff82a29a60ef82894d934b8a23f23d Mon Sep 17 00:00:00 2001
From: Chris Lattner
Date: Mon, 28 Jun 2010 23:44:11 +0000
Subject: [PATCH] Change CGCall to handle the "coerce" case where the coerce-to
 type is a FCA to pass each of the elements as individual scalars.  This
 produces code fast isel is less likely to reject and is easier on the
 optimizers.

For example, before we would compile:

struct DeclGroup { long NumDecls; char * Y; };

char * foo(DeclGroup D) {
  return D.NumDecls+D.Y;
}

to:

%struct.DeclGroup = type { i64, i64 }

define i64 @_Z3foo9DeclGroup(%struct.DeclGroup) nounwind {
entry:
  %D = alloca %struct.DeclGroup, align 8          ; <%struct.DeclGroup*> [#uses=3]
  store %struct.DeclGroup %0, %struct.DeclGroup* %D, align 1
  %tmp = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 0 ; [#uses=1]
  %tmp1 = load i64* %tmp                          ; [#uses=1]
  %tmp2 = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 1 ; [#uses=1]
  %tmp3 = load i64* %tmp2                         ; [#uses=1]
  %add = add nsw i64 %tmp1, %tmp3                 ; [#uses=1]
  ret i64 %add
}

Now we get:

%0 = type { i64, i64 }
%struct.DeclGroup = type { i64, i8* }

define i8* @_Z3foo9DeclGroup(i64, i64) nounwind {
entry:
  %D = alloca %struct.DeclGroup, align 8          ; <%struct.DeclGroup*> [#uses=3]
  %2 = insertvalue %0 undef, i64 %0, 0            ; <%0> [#uses=1]
  %3 = insertvalue %0 %2, i64 %1, 1               ; <%0> [#uses=1]
  %4 = bitcast %struct.DeclGroup* %D to %0*       ; <%0*> [#uses=1]
  store %0 %3, %0* %4, align 1
  %tmp = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 0 ; [#uses=1]
  %tmp1 = load i64* %tmp                          ; [#uses=1]
  %tmp2 = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 1 ; [#uses=1]
  %tmp3 = load i8** %tmp2                         ; [#uses=1]
  %add.ptr = getelementptr inbounds i8* %tmp3, i64 %tmp1 ; [#uses=1]
  ret i8* %add.ptr
}

Elimination of the FCA inside the function is still-to-come.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@107099 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CGCall.cpp               | 71 +++++++++++++++++++++++-----
 test/CodeGen/x86_64-arguments.c      |  2 +-
 test/CodeGenCXX/x86_64-arguments.cpp |  8 ++--
 3 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index 9719dfa432..eb517edd81 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -240,7 +240,8 @@ const CGFunctionInfo &CodeGenTypes::getFunctionInfo(CanQualType ResTy,
     return *FI;
 
   // Construct the function info.
-  FI = new CGFunctionInfo(CC, Info.getNoReturn(), Info.getRegParm(), ResTy, ArgTys);
+  FI = new CGFunctionInfo(CC, Info.getNoReturn(), Info.getRegParm(), ResTy,
+                          ArgTys);
   FunctionInfos.InsertNode(FI, InsertPos);
 
   // Compute ABI information.
@@ -259,6 +260,8 @@ CGFunctionInfo::CGFunctionInfo(unsigned _CallingConvention,
     NoReturn(_NoReturn), RegParm(_RegParm)
 {
   NumArgs = ArgTys.size();
+
+  // FIXME: Coallocate with the CGFunctionInfo object.
   Args = new ArgInfo[1 + NumArgs];
   Args[0].type = ResTy;
   for (unsigned i = 0; i < NumArgs; ++i)
@@ -593,9 +596,19 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI, bool IsVariadic) {
     case ABIArgInfo::Ignore:
       break;
 
-    case ABIArgInfo::Coerce:
-      ArgTys.push_back(AI.getCoerceToType());
+    case ABIArgInfo::Coerce: {
+      // If the coerce-to type is a first class aggregate, flatten it.  Either
+      // way is semantically identical, but fast-isel and the optimizer
+      // generally likes scalar values better than FCAs.
+      const llvm::Type *ArgTy = AI.getCoerceToType();
+      if (const llvm::StructType *STy = dyn_cast<llvm::StructType>(ArgTy)) {
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+          ArgTys.push_back(STy->getElementType(i));
+      } else {
+        ArgTys.push_back(ArgTy);
+      }
       break;
+    }
 
     case ABIArgInfo::Indirect: {
       // indirect arguments are always on the stack, which is addr space #0.
@@ -713,7 +726,12 @@ void CodeGenModule::ConstructAttributeList(const CGFunctionInfo &FI,
 
     switch (AI.getKind()) {
     case ABIArgInfo::Coerce:
-      break;
+      if (const llvm::StructType *STy =
+            dyn_cast<llvm::StructType>(AI.getCoerceToType()))
+        Index += STy->getNumElements();
+      else
+        ++Index;
+      continue;  // Skip index increment.
 
     case ABIArgInfo::Indirect:
       if (AI.getIndirectByVal())
@@ -806,7 +824,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
 
     switch (ArgI.getKind()) {
     case ABIArgInfo::Indirect: {
-      llvm::Value* V = AI;
+      llvm::Value *V = AI;
       if (hasAggregateLLVMType(Ty)) {
         // Do nothing, aggregates and complex variables are accessed by
         // reference.
@@ -826,7 +844,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
     case ABIArgInfo::Extend:
     case ABIArgInfo::Direct: {
       assert(AI != Fn->arg_end() && "Argument mismatch!");
-      llvm::Value* V = AI;
+      llvm::Value *V = AI;
       if (hasAggregateLLVMType(Ty)) {
         // Create a temporary alloca to hold the argument; the rest of
         // codegen expects to access aggregates & complex values by
@@ -876,12 +894,29 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
       continue;
 
     case ABIArgInfo::Coerce: {
-      assert(AI != Fn->arg_end() && "Argument mismatch!");
+      // If the coerce-to type is a first class aggregate, we flatten it and
+      // pass the elements. Either way is semantically identical, but fast-isel
+      // and the optimizer generally likes scalar values better than FCAs.
+      llvm::Value *FormalArg;
+      if (const llvm::StructType *STy =
+            dyn_cast<llvm::StructType>(ArgI.getCoerceToType())) {
+        // Reconstruct the FCA here.
+        // FIXME: If we have a direct match, do nice gep/store series.
+        FormalArg = llvm::UndefValue::get(STy);
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          assert(AI != Fn->arg_end() && "Argument mismatch!");
+          FormalArg = Builder.CreateInsertValue(FormalArg, AI++, i);
+        }
+      } else {
+        assert(AI != Fn->arg_end() && "Argument mismatch!");
+        FormalArg = AI++;
+      }
+
       // FIXME: This is very wasteful; EmitParmDecl is just going to drop the
       // result in a new alloca anyway, so we could just store into that
       // directly if we broke the abstraction down more.
       llvm::Value *V = CreateMemTemp(Ty, "coerce");
-      CreateCoercedStore(AI, V, /*DestIsVolatile=*/false, *this);
+      CreateCoercedStore(FormalArg, V, /*DestIsVolatile=*/false, *this);
       // Match to what EmitParmDecl is expecting for this type.
       if (!CodeGenFunction::hasAggregateLLVMType(Ty)) {
         V = EmitLoadOfScalar(V, false, Ty);
@@ -892,7 +927,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
         }
       }
       EmitParmDecl(*Arg, V);
-      break;
+      continue;  // Skip ++AI increment, already done.
     }
     }
 
@@ -1080,8 +1115,22 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         StoreComplexToAddr(RV.getComplexVal(), SrcPtr, false);
       } else
         SrcPtr = RV.getAggregateAddr();
-      Args.push_back(CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(),
-                                       *this));
+
+      llvm::Value *SrcVal =
+        CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(), *this);
+
+      // If the coerce-to type is a first class aggregate, we flatten it and
+      // pass the elements. Either way is semantically identical, but fast-isel
+      // and the optimizer generally likes scalar values better than FCAs.
+      if (const llvm::StructType *STy =
+            dyn_cast<llvm::StructType>(SrcVal->getType())) {
+        // Extract the elements of the value to pass in.
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+          Args.push_back(Builder.CreateExtractValue(SrcVal, i));
+      } else {
+        Args.push_back(SrcVal);
+      }
+
       break;
     }
 
diff --git a/test/CodeGen/x86_64-arguments.c b/test/CodeGen/x86_64-arguments.c
index 2422390108..2a96f22e1d 100644
--- a/test/CodeGen/x86_64-arguments.c
+++ b/test/CodeGen/x86_64-arguments.c
@@ -45,7 +45,7 @@ void f7(e7 a0) {
 // Test merging/passing of upper eightbyte with X87 class.
 //
 // CHECK: define %0 @f8_1()
-// CHECK: define void @f8_2(%0)
+// CHECK: define void @f8_2(i64, double)
 union u8 {
   long double a;
   int b;
diff --git a/test/CodeGenCXX/x86_64-arguments.cpp b/test/CodeGenCXX/x86_64-arguments.cpp
index 4bc83b8513..bc2f312aea 100644
--- a/test/CodeGenCXX/x86_64-arguments.cpp
+++ b/test/CodeGenCXX/x86_64-arguments.cpp
@@ -6,19 +6,19 @@
 // Basic base class test.
 struct f0_s0 { unsigned a; };
 struct f0_s1 : public f0_s0 { void *b; };
-// CHECK: define void @_Z2f05f0_s1([[i64_i64_ty]])
+// CHECK: define void @_Z2f05f0_s1(i64, i64)
 void f0(f0_s1 a0) { }
 
 // Check with two eight-bytes in base class.
 struct f1_s0 { unsigned a; unsigned b; float c; };
 struct f1_s1 : public f1_s0 { float d;};
-// CHECK: define void @_Z2f15f1_s1([[i64_double_ty]])
+// CHECK: define void @_Z2f15f1_s1(i64, double)
 void f1(f1_s1 a0) { }
 
 // Check with two eight-bytes in base class and merge.
 struct f2_s0 { unsigned a; unsigned b; float c; };
 struct f2_s1 : public f2_s0 { char d;};
-// CHECK: define void @_Z2f25f2_s1([[i64_i64_ty]])
+// CHECK: define void @_Z2f25f2_s1(i64, i64)
 void f2(f2_s1 a0) { }
 
 // PR5831
@@ -27,7 +27,7 @@ struct s3_1 { struct s3_0 a; long b; };
 void f3(struct s3_1 x) {}
 
 // CHECK: define i64 @_Z4f4_0M2s4i(i64)
-// CHECK: define [[i64_i64_ty]] @_Z4f4_1M2s4FivE([[i64_i64_ty]])
+// CHECK: define [[i64_i64_ty]] @_Z4f4_1M2s4FivE(i64, i64)
 struct s4 {};
 typedef int s4::* s4_mdp;
 typedef int (s4::*s4_mfp)();
-- 
2.40.0
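
For readers who want to see the effect of this flattening on a concrete C
signature, here is a minimal, hypothetical example in the style of
test/CodeGen/x86_64-arguments.c. The struct, function name, and CHECK line
below are not part of this patch; they assume the usual x86-64 SysV
classification of { long, double } into one INTEGER and one SSE eightbyte, so
the coerce-to type is the FCA { i64, double }, which this change now passes
as two scalars:

// Hypothetical test in the style of test/CodeGen/x86_64-arguments.c (not part
// of this patch).  Before this change the coerced argument would be passed as
// a single FCA value; with the FCA flattened, the expected definition is
// roughly:
// CHECK: define double @sum(i64, double)
struct Pair {
  long count;     // first eightbyte: INTEGER -> i64
  double weight;  // second eightbyte: SSE -> double
};

double sum(struct Pair p) {
  return (double)p.count + p.weight;
}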