From: Igor Laevsky
Date: Wed, 8 Feb 2017 14:32:04 +0000 (+0000)
Subject: [InstCombineCalls] Unfold element atomic memcpy instruction
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ab47a3d4af915177293080ab91b67c03519cc56d;p=llvm

[InstCombineCalls] Unfold element atomic memcpy instruction

Differential Revision: https://reviews.llvm.org/D28909

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294453 91177308-0d34-0410-b5e6-96231b3b80d8
---
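
For illustration, a small copy such as

  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 2, i32 4)

is expected to unfold roughly into the following sequence of unordered atomic
accesses (value names follow the memcpy_unfold.* scheme used by the new code
and are illustrative only; the precise shape is what the CHECK lines in the
new test file below pin down):

  %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
  %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
  %memcpy_unfold.val = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
  store atomic i32 %memcpy_unfold.val, i32* %memcpy_unfold.dst_casted unordered, align 4
  %memcpy_unfold.src_addr = getelementptr i32, i32* %memcpy_unfold.src_casted, i64 1
  %memcpy_unfold.dst_addr = getelementptr i32, i32* %memcpy_unfold.dst_casted, i64 1
  %memcpy_unfold.val1 = load atomic i32, i32* %memcpy_unfold.src_addr unordered, align 4
  store atomic i32 %memcpy_unfold.val1, i32* %memcpy_unfold.dst_addr unordered, align 4

After the loop the intrinsic's element count is set to 0, so the existing
zero-length rule in visitCallInst erases the leftover call on the next
InstCombine iteration.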
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 13f2f9e9c72..23f8035e2aa 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -60,6 +60,12 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
+    "unfold-element-atomic-memcpy-max-elements",
+    cl::init(16),
+    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
+             "allowed to unfold"));
+
 /// Return the specified type promoted as it would be to pass though a va_arg
 /// area.
 static Type *getPromotedType(Type *Ty) {
@@ -108,6 +114,78 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
   return ConstantVector::get(BoolVec);
 }
 
+Instruction *
+InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into a sequence of explicit atomic loads and
+  // stores.
+  // First check that the number of elements is a compile-time constant.
+  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
+  if (!NumElementsCI)
+    return nullptr;
+
+  // Check that there are not too many elements.
+  uint64_t NumElements = NumElementsCI->getZExtValue();
+  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
+    return nullptr;
+
+  // Don't unfold into illegal integers.
+  uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes() * 8;
+  if (!getDataLayout().isLegalInteger(ElementSizeInBytes))
+    return nullptr;
+
+  // Cast source and destination to the correct type. Intrinsic input arguments
+  // are usually represented as i8*.
+  // Often operands will be explicitly cast to i8*, and we could just strip
+  // those casts instead of inserting new ones. However, it's easier to rely on
+  // other InstCombine rules, which will cover the trivial cases anyway.
+  Value *Src = AMI->getRawSource();
+  Value *Dst = AMI->getRawDest();
+  Type *ElementPointerType = Type::getIntNPtrTy(
+      AMI->getContext(), ElementSizeInBytes, Src->getType()->getPointerAddressSpace());
+
+  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                "memcpy_unfold.src_casted");
+  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                "memcpy_unfold.dst_casted");
+
+  for (uint64_t i = 0; i < NumElements; ++i) {
+    // Get current element addresses.
+    ConstantInt *ElementIdxCI =
+        ConstantInt::get(AMI->getContext(), APInt(64, i));
+    Value *SrcElementAddr =
+        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+    Value *DstElementAddr =
+        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+    // Load from the source. Transfer alignment information and mark the load
+    // as unordered atomic.
+    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+    Load->setOrdering(AtomicOrdering::Unordered);
+    // We know the alignment of the first element. The verifier also guarantees
+    // that the element size is less than or equal to the first element's
+    // alignment and that both of these values are powers of two.
+    // This means that all subsequent accesses are at least element-size
+    // aligned.
+    // TODO: We could infer better alignment, but there is no evidence that
+    // this will matter.
+    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
+                              : AMI->getElementSizeInBytes());
+    Load->setDebugLoc(AMI->getDebugLoc());
+
+    // Store the loaded value via an unordered atomic store.
+    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+    Store->setOrdering(AtomicOrdering::Unordered);
+    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
+                               : AMI->getElementSizeInBytes());
+    Store->setDebugLoc(AMI->getDebugLoc());
+  }
+
+  // Set the number of elements of the copy to 0; the call will then be deleted
+  // on the next iteration.
+  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+  return AMI;
+}
+
 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
   unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@@ -1839,6 +1917,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
       if (C->isNullValue())
         return eraseInstFromFunction(*AMI);
+
+    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+      return I;
   }
 
   if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 9e04f5d7cb0..68bae0e5200 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -650,6 +650,8 @@ private:
   Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
   Instruction *MatchBSwap(BinaryOperator &I);
   bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+  Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
   Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
   Instruction *SimplifyMemSet(MemSetInst *MI);
 
diff --git a/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
new file mode 100644
index 00000000000..107440f10a5
--- /dev/null
+++ b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
@@ -0,0 +1,92 @@
+; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test basic unfolding
+define void @test1(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test1
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
+; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8
+
+; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL3:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 8 %Src, i64 4, i32 4)
+  ret void
+}
+
+; Test that we don't unfold too much
+define void @test2(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test2
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4)
+  ret void
+}
+
+; Test that we will not unfold into non-native integers
+define void @test3(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test3
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64)
+  ret void
+}
+
+; Test that we will eliminate redundant bitcasts
+define void @test4(i64* %Src, i64* %Dst) {
+; CHECK-LABEL: test4
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-NOT: bitcast
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i64, i64* %Src unordered, align 16
+; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16
+
+; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
+; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
+; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
+; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
+; CHECK-DAG: [[VAL3:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
+; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
+; CHECK-DAG: [[VAL4:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
+entry:
+  %Src.casted = bitcast i64* %Src to i8*
+  %Dst.casted = bitcast i64* %Dst to i8*
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8)
+  ret void
+}
+
+define void @test5(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test5
+
+; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+  ret void
+}
+
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32)
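
One detail worth keeping in mind when reading the RUN line above: the guard in
SimplifyElementAtomicMemCpy bails out when NumElements >=
UnfoldElementAtomicMemcpyMaxElements, so a copy whose element count is exactly
equal to the cap is also left untouched. A sketch of such a case under the same
RUN line (max-elements=8) and the intrinsic declaration above; this is an
inferred edge case, not one of the committed tests, and @at_the_cap is a name
chosen here only for illustration:

  ; 8 elements with -unfold-element-atomic-memcpy-max-elements=8: the >= guard
  ; should leave the intrinsic call in place (no atomic loads/stores emitted).
  define void @at_the_cap(i8* %Src, i8* %Dst) {
  entry:
    call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 8, i32 4)
    ret void
  }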