From b132d1b1cb0e68b1b5c85034942ada04acf0cfd8 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 30 Jan 2019 19:21:11 +0000 Subject: [PATCH] SimplifyDemandedVectorElts for all intrinsics The point is that this simplifies integration of new intrinsics into SimplifiedDemandedVectorElts, and ensures we don't miss any existing ones. This is intended to be NFC-ish, but as seen from the diffs, can produce slightly different output. This is due to order of transforms w/in instcombine resulting in two slightly different fixed points. That's something we should fix, but isn't a problem w/this patch per se. Differential Revision: https://reviews.llvm.org/D57398 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352653 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineCalls.cpp | 47 ++++++------------- test/Transforms/InstCombine/X86/x86-avx512.ll | 16 +++---- 2 files changed, 23 insertions(+), 40 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index e8b0d521ae7..ec4b7635ce3 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1868,6 +1868,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } + // For vector result intrinsics, use the generic demanded vector support to + // simplify any operands before moving on to the per-intrinsic rules. + if (II->getType()->isVectorTy()) { + auto VWidth = II->getType()->getVectorNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { + if (V != II) + return replaceInstUsesWith(*II, V); + return II; + } + } + if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) return I; @@ -2666,41 +2679,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, V); } } - LLVM_FALLTHROUGH; + break; - // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts. - case Intrinsic::x86_avx512_mask_max_ss_round: - case Intrinsic::x86_avx512_mask_min_ss_round: - case Intrinsic::x86_avx512_mask_max_sd_round: - case Intrinsic::x86_avx512_mask_min_sd_round: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse_min_ss: - case Intrinsic::x86_sse_max_ss: - case Intrinsic::x86_sse2_cmp_sd: - case Intrinsic::x86_sse2_min_sd: - case Intrinsic::x86_sse2_max_sd: - case Intrinsic::x86_xop_vfrcz_ss: - case Intrinsic::x86_xop_vfrcz_sd: { - unsigned VWidth = II->getType()->getVectorNumElements(); - APInt UndefElts(VWidth, 0); - APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); - if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { - if (V != II) - return replaceInstUsesWith(*II, V); - return II; - } - break; - } case Intrinsic::x86_sse41_round_ss: case Intrinsic::x86_sse41_round_sd: { - unsigned VWidth = II->getType()->getVectorNumElements(); - APInt UndefElts(VWidth, 0); - APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); - if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { - if (V != II) - return replaceInstUsesWith(*II, V); - return II; - } else if (Value *V = simplifyX86round(*II, Builder)) + if (Value *V = simplifyX86round(*II, Builder)) return replaceInstUsesWith(*II, V); break; } diff --git a/test/Transforms/InstCombine/X86/x86-avx512.ll b/test/Transforms/InstCombine/X86/x86-avx512.ll index 21e2e216bed..9c5080c7792 100644 --- a/test/Transforms/InstCombine/X86/x86-avx512.ll +++ b/test/Transforms/InstCombine/X86/x86-avx512.ll @@ -7,7 +7,7 @@ declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_add_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP4]] @@ -38,7 +38,7 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP8]] @@ -149,7 +149,7 @@ declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>, define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_sub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP4]] @@ -180,7 +180,7 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP8]] @@ -291,7 +291,7 @@ declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>, define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_mul_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP4]] @@ -322,7 +322,7 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP8]] @@ -433,7 +433,7 @@ declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>, define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_div_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP4]] @@ -464,7 +464,7 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP8]] -- 2.50.1