From 07d3c0f01c56c4fe13ffd6a7194ca4aa0f5513a1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 16 Jan 2017 11:30:41 +0000 Subject: [PATCH] [InstCombine][SSE] Add DemandedElts support for PSHUFB instructions Simplify a pshufb shuffle mask based on the elements of the mask that are actually demanded. Differential Revision: https://reviews.llvm.org/D28745 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292101 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineCalls.cpp | 12 +++++++++- .../InstCombineSimplifyDemanded.cpp | 10 ++++++++ test/Transforms/InstCombine/x86-pshufb.ll | 23 ++++++++----------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 2ef82ba3ed8..ec2ebaaed88 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2315,10 +2315,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_ssse3_pshuf_b_128: case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: + case Intrinsic::x86_avx512_pshuf_b_512: { if (Value *V = simplifyX86pshufb(*II, *Builder)) return replaceInstUsesWith(*II, V); + + unsigned VWidth = II->getType()->getVectorNumElements(); + APInt UndefElts(VWidth, 0); + APInt DemandedElts = APInt::getAllOnesValue(VWidth); + if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) { + if (V != II) + return replaceInstUsesWith(*II, V); + return II; + } break; + } case Intrinsic::x86_avx_vpermilvar_ps: case Intrinsic::x86_avx_vpermilvar_ps_256: diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 8b930bd95df..95100d074b4 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1472,6 +1472,16 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; } + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: { + Value *Op1 = II->getArgOperand(1); + TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts, + Depth + 1); + if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } + break; + } + // SSE4A instructions leave the upper 64-bits of the 128-bit result // in an undefined state. case Intrinsic::x86_sse4a_extrq: diff --git a/test/Transforms/InstCombine/x86-pshufb.ll b/test/Transforms/InstCombine/x86-pshufb.ll index 7da216f7e48..f181ef57fe2 100644 --- a/test/Transforms/InstCombine/x86-pshufb.ll +++ b/test/Transforms/InstCombine/x86-pshufb.ll @@ -469,15 +469,12 @@ define <64 x i8> @fold_with_allundef_elts_avx512(<64 x i8> %InVec) { } ; Demanded elts tests. -; FIXME: Missed opportunities to pass demanded elts through the pshufb shuffle mask define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, i8 %M0, i8 %M15) { ; CHECK-LABEL: @demanded_elts_insertion( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 %M15, i32 15 -; CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %BaseMask) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP2]] ; %1 = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0 %2 = insertelement <16 x i8> %1, i8 %M15, i32 15 @@ -489,9 +486,8 @@ define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) { ; CHECK-LABEL: @demanded_elts_insertion_avx2( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> [[TMP1]], i8 %M22, i32 22 -; CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP2]]) -; CHECK-NEXT: ret <32 x i8> [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP1]]) +; CHECK-NEXT: ret <32 x i8> [[TMP2]] ; %1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0 %2 = insertelement <32 x i8> %1, i8 %M22, i32 22 @@ -502,11 +498,10 @@ define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %Base define <64 x i8> @demanded_elts_insertion_avx512(<64 x i8> %InVec, <64 x i8> %BaseMask, i8 %M0, i8 %M30) { ; CHECK-LABEL: @demanded_elts_insertion_avx512( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <64 x i8> [[TMP1]], i8 %M30, i32 30 -; CHECK-NEXT: [[TMP3:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <64 x i8> [[TMP3]], <64 x i8> undef, <64 x i32> zeroinitializer -; CHECK-NEXT: ret <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <64 x i8> undef, i8 %M0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: ret <64 x i8> [[TMP3]] ; %1 = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0 %2 = insertelement <64 x i8> %1, i8 %M30, i32 30 -- 2.50.1