Demanded elements support for masked.load and masked.gather

author Philip Reames <listmail@philipreames.com>

Tue, 19 Mar 2019 20:10:00 +0000 (20:10 +0000)

committer Philip Reames <listmail@philipreames.com>

Tue, 19 Mar 2019 20:10:00 +0000 (20:10 +0000)
author Philip Reames <listmail@philipreames.com>
Tue, 19 Mar 2019 20:10:00 +0000 (20:10 +0000)
committer Philip Reames <listmail@philipreames.com>
Tue, 19 Mar 2019 20:10:00 +0000 (20:10 +0000)
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

index 94e2eed05f8845dc2da0219d7e4aaad19d6a72a8..b79a4d78648becec0635de1d4784fb8a161dcc75 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1436,6 +1436,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
      if (!II) break;
      switch (II->getIntrinsicID()) {
+    case Intrinsic::masked_gather: // fallthrough
+    case Intrinsic::masked_load: {
+      APInt DemandedPtrs(DemandedElts), DemandedPassThrough(DemandedElts);
+      if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
+        for (unsigned i = 0; i < VWidth; i++) {
+          Constant *CElt = CV->getAggregateElement(i);
+          if (CElt->isNullValue())
+            DemandedPtrs.clearBit(i);
+          else if (CElt->isAllOnesValue())
+            DemandedPassThrough.clearBit(i);
+        }
+      if (II->getIntrinsicID() == Intrinsic::masked_gather)
+        simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
+      simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
+      
+      // An output element is undefined if it is undefined in both sources.
+      // TODO: can strengthen via mask as well.
+      UndefElts = UndefElts2 & UndefElts3;
+      break;
+    }
      case Intrinsic::x86_xop_vfrcz_ss:
      case Intrinsic::x86_xop_vfrcz_sd:
        // The instructions for these intrinsics are speced to zero upper bits not
diff --git a/test/Transforms/InstCombine/X86/x86-masked-memops.ll b/test/Transforms/InstCombine/X86/x86-masked-memops.ll

index 123ba1f49e1ae3ebb12b975bcef9466657cc58a3..be190007327d8e9f6bab299308562f0ee8b3d784 100644 (file)
--- a/test/Transforms/InstCombine/X86/x86-masked-memops.ll
+++ b/test/Transforms/InstCombine/X86/x86-masked-memops.ll
@@ -55,7 +55,7 @@ define <4 x float> @mload_real_ones(i8* %f) {
  define <4 x float> @mload_one_one(i8* %f) {
  ; CHECK-LABEL: @mload_one_one(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>)
  ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
  ;
    %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
@@ -68,7 +68,7 @@ define <4 x float> @mload_one_one(i8* %f) {
  define <2 x double> @mload_one_one_double(i8* %f) {
  ; CHECK-LABEL: @mload_one_one_double(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x double>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> <double undef, double 0.000000e+00>)
  ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
  ;
    %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(i8* %f, <2 x i64> <i64 -1, i64 0>)
@@ -81,7 +81,7 @@ define <2 x double> @mload_one_one_double(i8* %f) {
  define <8 x float> @mload_v8f32(i8* %f) {
  ; CHECK-LABEL: @mload_v8f32(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
  ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
  ;
    %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
@@ -92,7 +92,7 @@ define <8 x float> @mload_v8f32(i8* %f) {
  define <4 x double> @mload_v4f64(i8* %f) {
  ; CHECK-LABEL: @mload_v4f64(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x double>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> <double undef, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>)
  ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
  ;
    %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
@@ -105,7 +105,7 @@ define <4 x double> @mload_v4f64(i8* %f) {
  define <4 x i32> @mload_v4i32(i8* %f) {
  ; CHECK-LABEL: @mload_v4i32(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 0, i32 0, i32 0, i32 undef>)
  ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
  ;
    %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
@@ -116,7 +116,7 @@ define <4 x i32> @mload_v4i32(i8* %f) {
  define <2 x i64> @mload_v2i64(i8* %f) {
  ; CHECK-LABEL: @mload_v2i64(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> <i64 undef, i64 0>)
  ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
  ;
    %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %f, <2 x i64> <i64 -1, i64 0>)
@@ -127,7 +127,7 @@ define <2 x i64> @mload_v2i64(i8* %f) {
  define <8 x i32> @mload_v8i32(i8* %f) {
  ; CHECK-LABEL: @mload_v8i32(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0>)
  ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
  ;
    %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
@@ -138,7 +138,7 @@ define <8 x i32> @mload_v8i32(i8* %f) {
  define <4 x i64> @mload_v4i64(i8* %f) {
  ; CHECK-LABEL: @mload_v4i64(
  ; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> <i64 undef, i64 0, i64 0, i64 0>)
  ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
  ;
    %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
diff --git a/test/Transforms/InstCombine/masked_intrinsics.ll b/test/Transforms/InstCombine/masked_intrinsics.ll

index dd7fa7dd3059a28ee076a522f6fd434d7895cbc2..e685e03726c06229e296e973ba2e76e0824a1b6d 100644 (file)
--- a/test/Transforms/InstCombine/masked_intrinsics.ll
+++ b/test/Transforms/InstCombine/masked_intrinsics.ll
@@ -35,19 +35,20 @@ define <2 x double> @load_undefmask(<2 x double>* %ptr, <2 x double> %passthru)
  
  }
  
+@G = external global i8
+
  define <2 x double> @load_cemask(<2 x double>* %ptr, <2 x double> %passthru)  {
  ; CHECK-LABEL: @load_cemask(
-; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 false>, <2 x double> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 ptrtoint (i8* @G to i1)>, <2 x double> [[PASSTHRU:%.*]])
  ; CHECK-NEXT:    ret <2 x double> [[RES]]
  ;
-  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> <i1 1, i1 trunc (i32 0 to i1)>, <2 x double> %passthru)
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> <i1 1, i1 ptrtoint (i8* @G to i1)>, <2 x double> %passthru)
    ret <2 x double> %res
  }
  
  define <2 x double> @load_lane0(<2 x double>* %ptr, double %pt)  {
  ; CHECK-LABEL: @load_lane0(
-; CHECK-NEXT:    [[PTV1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0
-; CHECK-NEXT:    [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTV2:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 1
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 false>, <2 x double> [[PTV2]])
  ; CHECK-NEXT:    ret <2 x double> [[RES]]
  ;
@@ -102,7 +103,7 @@ define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru
  
  define <2 x double> @gather_onemask(<2 x double*> %ptrs, <2 x double> %passthru)  {
  ; CHECK-LABEL: @gather_onemask(
-; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 5, <2 x i1> <i1 true, i1 true>, <2 x double> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 5, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
  ; CHECK-NEXT:    ret <2 x double> [[RES]]
  ;
    %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 5, <2 x i1> <i1 true, i1 true>, <2 x double> %passthru)
@@ -112,9 +113,8 @@ define <2 x double> @gather_onemask(<2 x double*> %ptrs, <2 x double> %passthru)
  
  define <2 x double> @gather_lane0(double* %base, double %pt)  {
  ; CHECK-LABEL: @gather_lane0(
-; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> <i64 0, i64 1>
-; CHECK-NEXT:    [[PT_V1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0
-; CHECK-NEXT:    [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> <i64 0, i64 undef>
+; CHECK-NEXT:    [[PT_V2:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 1
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS]], i32 5, <2 x i1> <i1 true, i1 false>, <2 x double> [[PT_V2]])
  ; CHECK-NEXT:    ret <2 x double> [[RES]]
  ;
author	Philip Reames <listmail@philipreames.com>
	Tue, 19 Mar 2019 20:10:00 +0000 (20:10 +0000)
committer	Philip Reames <listmail@philipreames.com>
	Tue, 19 Mar 2019 20:10:00 +0000 (20:10 +0000)
lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp		patch \| blob \| history
test/Transforms/InstCombine/X86/x86-masked-memops.ll		patch \| blob \| history
test/Transforms/InstCombine/masked_intrinsics.ll		patch \| blob \| history