From 2f752ea08343e491bb1915cde2bcfd91d1af2d92 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 23 Jan 2017 15:22:59 +0000
Subject: [PATCH] [InstCombine][X86] Add MULDQ/MULUDQ constant folding support

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292793 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCalls.cpp     | 43 +++++++++++++++++--
 test/Transforms/InstCombine/x86-muldq.ll | 36 ++++++----------
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 9f9bf4094c5..e6e126bf784 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -510,16 +510,53 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
-static Value *simplifyX86muldq(const IntrinsicInst &II) {
+static Value *simplifyX86muldq(const IntrinsicInst &II,
+                               InstCombiner::BuilderTy &Builder) {
   Value *Arg0 = II.getArgOperand(0);
   Value *Arg1 = II.getArgOperand(1);
   Type *ResTy = II.getType();
+  assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
+         Arg1->getType()->getScalarSizeInBits() == 32 &&
+         ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
 
   // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
     return ConstantAggregateZero::get(ResTy);
 
-  return nullptr;
+  // Constant folding.
+  // PMULDQ  = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
+  //                vXi64 sext(shuffle<0,2,..>(Arg1))))
+  // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
+  //                vXi64 zext(shuffle<0,2,..>(Arg1))))
+  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+    return nullptr;
+
+  unsigned NumElts = ResTy->getVectorNumElements();
+  assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
+         Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
+         "Unexpected muldq/muludq types");
+
+  unsigned IntrinsicID = II.getIntrinsicID();
+  bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
+                   Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
+                   Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
+
+  SmallVector<unsigned, 16> ShuffleMask;
+  for (unsigned i = 0; i != NumElts; ++i)
+    ShuffleMask.push_back(i * 2);
+
+  auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
+  auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
+
+  if (IsSigned) {
+    LHS = Builder.CreateSExt(LHS, ResTy);
+    RHS = Builder.CreateSExt(RHS, ResTy);
+  } else {
+    LHS = Builder.CreateZExt(LHS, ResTy);
+    RHS = Builder.CreateZExt(RHS, ResTy);
+  }
+
+  return Builder.CreateMul(LHS, RHS);
 }
 
 static Value *simplifyX86movmsk(const IntrinsicInst &II,
@@ -2154,7 +2191,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::x86_avx2_pmulu_dq:
   case Intrinsic::x86_avx512_pmul_dq_512:
   case Intrinsic::x86_avx512_pmulu_dq_512: {
-    if (Value *V = simplifyX86muldq(*II))
+    if (Value *V = simplifyX86muldq(*II, *Builder))
       return replaceInstUsesWith(*II, V);
 
     unsigned VWidth = II->getType()->getVectorNumElements();
diff --git a/test/Transforms/InstCombine/x86-muldq.ll b/test/Transforms/InstCombine/x86-muldq.ll
index 70d4ab22d2d..bcbb8919c40 100644
--- a/test/Transforms/InstCombine/x86-muldq.ll
+++ b/test/Transforms/InstCombine/x86-muldq.ll
@@ -55,8 +55,7 @@ define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
 
 define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuludq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> )
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
   ret <2 x i64> %1
@@ -64,8 +63,7 @@ define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 
 define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuludq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> , <8 x i32> undef)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
   ret <4 x i64> %1
@@ -73,8 +71,7 @@ define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 
 define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuludq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> )
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
   ret <8 x i64> %1
@@ -82,8 +79,7 @@ define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
 
 define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuldq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> , <4 x i32> undef)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
   ret <2 x i64> %1
@@ -91,8 +87,7 @@ define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 
 define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuldq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> )
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
   ret <4 x i64> %1
@@ -100,8 +95,7 @@ define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
 
 define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuldq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> , <16 x i32> undef)
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
   ret <8 x i64> %1
@@ -113,8 +107,7 @@ define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
 
 define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuludq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> , <4 x i32> )
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> 
 ;
   %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> , <4 x i32> )
   ret <2 x i64> %1
@@ -122,8 +115,7 @@ define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 
 define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuludq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> , <8 x i32> )
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
   ret <4 x i64> %1
@@ -131,8 +123,7 @@ define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 
 define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuludq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> , <16 x i32> )
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> 
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> , <16 x i32> )
   ret <8 x i64> %1
@@ -140,8 +131,7 @@ define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
 
 define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuldq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> , <4 x i32> )
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> 
 ;
   %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> , <4 x i32> )
   ret <2 x i64> %1
@@ -149,8 +139,7 @@ define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 
 define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuldq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> , <8 x i32> )
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> 
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> , <8 x i32> )
   ret <4 x i64> %1
@@ -158,8 +147,7 @@ define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
 
 define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuldq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> , <16 x i32> )
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> )
   ret <8 x i64> %1
-- 
2.40.0
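
Note (illustration appended for review, not part of the patch): for a 128-bit PMULUDQ whose operands are both constant, simplifyX86muldq now emits the IR sketched below via Builder.CreateShuffleVector/CreateZExt/CreateMul. The value names are hypothetical, and %c0/%c1 stand in for the constant <4 x i32> operands; the signed PMULDQ forms use sext instead of zext. Because every operand in the chain is constant, InstCombine's ordinary folds then collapse it to the single <2 x i64> constant that the updated fold_* tests expect.

  ; keep the even (low) dwords of each operand: mask = {0, 2}
  %lo0 = shufflevector <4 x i32> %c0, <4 x i32> %c0, <2 x i32> <i32 0, i32 2>
  %lo1 = shufflevector <4 x i32> %c1, <4 x i32> %c1, <2 x i32> <i32 0, i32 2>
  ; widen to 64 bits (zext here; sext for pmuldq)
  %ext0 = zext <2 x i32> %lo0 to <2 x i64>
  %ext1 = zext <2 x i32> %lo1 to <2 x i64>
  ; full 32x32->64 multiply per lane
  %res = mul <2 x i64> %ext0, %ext1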