From: Philip Reames Date: Tue, 19 Mar 2019 20:10:00 +0000 (+0000) Subject: Demanded elements support for masked.load and masked.gather X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=94c6fbfa0e7a82de008a1d6440c6bb47e8f14fec;p=llvm Demanded elements support for masked.load and masked.gather Teach instcombine to propagate demanded elements through a masked load or masked gather instruction. This is in the broader context of improving vector pointer instcombine under https://reviews.llvm.org/D57140. Differential Revision: https://reviews.llvm.org/D57372 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356510 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 94e2eed05f8..b79a4d78648 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1436,6 +1436,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, IntrinsicInst *II = dyn_cast(I); if (!II) break; switch (II->getIntrinsicID()) { + case Intrinsic::masked_gather: // fallthrough + case Intrinsic::masked_load: { + APInt DemandedPtrs(DemandedElts), DemandedPassThrough(DemandedElts); + if (auto *CV = dyn_cast(II->getOperand(2))) + for (unsigned i = 0; i < VWidth; i++) { + Constant *CElt = CV->getAggregateElement(i); + if (CElt->isNullValue()) + DemandedPtrs.clearBit(i); + else if (CElt->isAllOnesValue()) + DemandedPassThrough.clearBit(i); + } + if (II->getIntrinsicID() == Intrinsic::masked_gather) + simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2); + simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3); + + // Output elements are undefined if the element from both sources are. + // TODO: can strengthen via mask as well. + UndefElts = UndefElts2 & UndefElts3; + break; + } case Intrinsic::x86_xop_vfrcz_ss: case Intrinsic::x86_xop_vfrcz_sd: // The instructions for these intrinsics are speced to zero upper bits not diff --git a/test/Transforms/InstCombine/X86/x86-masked-memops.ll b/test/Transforms/InstCombine/X86/x86-masked-memops.ll index 123ba1f49e1..be190007327 100644 --- a/test/Transforms/InstCombine/X86/x86-masked-memops.ll +++ b/test/Transforms/InstCombine/X86/x86-masked-memops.ll @@ -55,7 +55,7 @@ define <4 x float> @mload_real_ones(i8* %f) { define <4 x float> @mload_one_one(i8* %f) { ; CHECK-LABEL: @mload_one_one( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[CASTVEC]], i32 1, <4 x i1> , <4 x float> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[CASTVEC]], i32 1, <4 x i1> , <4 x float> ) ; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> ) @@ -68,7 +68,7 @@ define <4 x float> @mload_one_one(i8* %f) { define <2 x double> @mload_one_one_double(i8* %f) { ; CHECK-LABEL: @mload_one_one_double( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[CASTVEC]], i32 1, <2 x i1> , <2 x double> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[CASTVEC]], i32 1, <2 x i1> , <2 x double> ) ; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(i8* %f, <2 x i64> ) @@ -81,7 +81,7 @@ define <2 x double> @mload_one_one_double(i8* %f) { define <8 x float> @mload_v8f32(i8* %f) { ; CHECK-LABEL: @mload_v8f32( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[CASTVEC]], i32 1, <8 x i1> , <8 x float> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[CASTVEC]], i32 1, <8 x i1> , <8 x float> ) ; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> ) @@ -92,7 +92,7 @@ define <8 x float> @mload_v8f32(i8* %f) { define <4 x double> @mload_v4f64(i8* %f) { ; CHECK-LABEL: @mload_v4f64( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[CASTVEC]], i32 1, <4 x i1> , <4 x double> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[CASTVEC]], i32 1, <4 x i1> , <4 x double> ) ; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %f, <4 x i64> ) @@ -105,7 +105,7 @@ define <4 x double> @mload_v4f64(i8* %f) { define <4 x i32> @mload_v4i32(i8* %f) { ; CHECK-LABEL: @mload_v4i32( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[CASTVEC]], i32 1, <4 x i1> , <4 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[CASTVEC]], i32 1, <4 x i1> , <4 x i32> ) ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %f, <4 x i32> ) @@ -116,7 +116,7 @@ define <4 x i32> @mload_v4i32(i8* %f) { define <2 x i64> @mload_v2i64(i8* %f) { ; CHECK-LABEL: @mload_v2i64( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[CASTVEC]], i32 1, <2 x i1> , <2 x i64> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[CASTVEC]], i32 1, <2 x i1> , <2 x i64> ) ; CHECK-NEXT: ret <2 x i64> [[TMP1]] ; %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %f, <2 x i64> ) @@ -127,7 +127,7 @@ define <2 x i64> @mload_v2i64(i8* %f) { define <8 x i32> @mload_v8i32(i8* %f) { ; CHECK-LABEL: @mload_v8i32( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[CASTVEC]], i32 1, <8 x i1> , <8 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[CASTVEC]], i32 1, <8 x i1> , <8 x i32> ) ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %f, <8 x i32> ) @@ -138,7 +138,7 @@ define <8 x i32> @mload_v8i32(i8* %f) { define <4 x i64> @mload_v4i64(i8* %f) { ; CHECK-LABEL: @mload_v4i64( ; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* [[CASTVEC]], i32 1, <4 x i1> , <4 x i64> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* [[CASTVEC]], i32 1, <4 x i1> , <4 x i64> ) ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> ) diff --git a/test/Transforms/InstCombine/masked_intrinsics.ll b/test/Transforms/InstCombine/masked_intrinsics.ll index dd7fa7dd305..e685e03726c 100644 --- a/test/Transforms/InstCombine/masked_intrinsics.ll +++ b/test/Transforms/InstCombine/masked_intrinsics.ll @@ -35,19 +35,20 @@ define <2 x double> @load_undefmask(<2 x double>* %ptr, <2 x double> %passthru) } +@G = external global i8 + define <2 x double> @load_cemask(<2 x double>* %ptr, <2 x double> %passthru) { ; CHECK-LABEL: @load_cemask( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> , <2 x double> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> , <2 x double> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; - %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru) ret <2 x double> %res } define <2 x double> @load_lane0(<2 x double>* %ptr, double %pt) { ; CHECK-LABEL: @load_lane0( -; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0 -; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[PTV2:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 1 ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> , <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; @@ -102,7 +103,7 @@ define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru define <2 x double> @gather_onemask(<2 x double*> %ptrs, <2 x double> %passthru) { ; CHECK-LABEL: @gather_onemask( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 5, <2 x i1> , <2 x double> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 5, <2 x i1> , <2 x double> undef) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 5, <2 x i1> , <2 x double> %passthru) @@ -112,9 +113,8 @@ define <2 x double> @gather_onemask(<2 x double*> %ptrs, <2 x double> %passthru) define <2 x double> @gather_lane0(double* %base, double %pt) { ; CHECK-LABEL: @gather_lane0( -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> -; CHECK-NEXT: [[PT_V1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0 -; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> +; CHECK-NEXT: [[PT_V2:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 1 ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS]], i32 5, <2 x i1> , <2 x double> [[PT_V2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ;