From d6a2755daa3b51ae99156e996fd9033f207d9cab Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 1 May 2019 14:50:50 +0000
Subject: [PATCH] [X86][SSE] Add demanded elts support for
 X86ISD::PMULDQ/PMULUDQ

Add handling to SimplifyDemandedVectorEltsForTargetNode and
SimplifyDemandedBitsForTargetNode.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359686 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          |  27 ++-
 test/CodeGen/X86/vector-reduce-mul-widen.ll | 178 ++++++++++----------
 test/CodeGen/X86/vector-reduce-mul.ll       | 178 ++++++++++----------
 3 files changed, 202 insertions(+), 181 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0cc7c157b74..80e83544cbb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -33237,6 +33237,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 
   // Handle special case opcodes.
   switch (Opc) {
+  case X86ISD::PMULDQ:
+  case X86ISD::PMULUDQ: {
+    APInt LHSUndef, LHSZero;
+    APInt RHSUndef, RHSZero;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+    if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+                                   Depth + 1))
+      return true;
+    if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+                                   Depth + 1))
+      return true;
+    // Multiply by zero.
+    KnownZero = LHSZero | RHSZero;
+    break;
+  }
   case X86ISD::VSHL:
   case X86ISD::VSRL:
   case X86ISD::VSRA: {
@@ -33433,7 +33449,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
       SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL,
                                        ExtSizeInBits);
       return TLO.CombineTo(Op, Insert);
-    }
+    }
+      // Arithmetic Ops.
+    case X86ISD::PMULDQ:
+    case X86ISD::PMULUDQ:
       // Target Shuffles.
     case X86ISD::PSHUFB:
     case X86ISD::UNPCKL:
@@ -33552,9 +33571,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     SDValue RHS = Op.getOperand(1);
     // FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32); - if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1)) + if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp, + TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1)) + if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, + TLO, Depth + 1)) return true; break; } diff --git a/test/CodeGen/X86/vector-reduce-mul-widen.ll b/test/CodeGen/X86/vector-reduce-mul-widen.ll index 4c366a61f39..a308125a767 100644 --- a/test/CodeGen/X86/vector-reduce-mul-widen.ll +++ b/test/CodeGen/X86/vector-reduce-mul-widen.ll @@ -153,13 +153,13 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -177,13 +177,13 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX512BW-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -192,22 +192,22 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BWVL-LABEL: test_v4i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: 
vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper @@ -345,13 +345,13 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -369,22 +369,22 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; 
AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -402,22 +402,22 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper @@ -648,13 +648,13 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -680,22 +680,22 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; 
AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -721,22 +721,22 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll index 12cef44b3e2..0b36b717994 100644 --- a/test/CodeGen/X86/vector-reduce-mul.ll +++ b/test/CodeGen/X86/vector-reduce-mul.ll @@ -153,13 +153,13 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; 
AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -177,13 +177,13 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX512BW-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -192,22 +192,22 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BWVL-LABEL: test_v4i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper @@ -345,13 +345,13 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = 
ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -369,22 +369,22 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -402,22 +402,22 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: 
vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper @@ -648,13 +648,13 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -680,22 +680,22 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -721,22 +721,22 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpaddq 
%zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper -- 2.50.1
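
The change relies on two properties of the PMULDQ/PMULUDQ nodes. Each 64-bit result lane reads only the low 32 bits of the corresponding 64-bit lane of each operand, which is why SimplifyDemandedBitsForTargetNode can keep DemandedMask = APInt::getLowBitsSet(64, 32) while now also passing OriginalDemandedElts through. And a lane that is known zero in either operand produces a known-zero result lane, which is what KnownZero = LHSZero | RHSZero records in SimplifyDemandedVectorEltsForTargetNode. The sketch below is a minimal standalone C++ model of those two facts for the unsigned case (PMULUDQ); the pmuludq() helper and the two-lane array type are illustrative stand-ins, not LLVM or SSE intrinsics.

#include <array>
#include <cassert>
#include <cstdint>

// Two 64-bit lanes, modelling one XMM register holding <2 x i64>.
using V2x64 = std::array<uint64_t, 2>;

// Scalar model of PMULUDQ: per lane, zero-extend the low 32 bits of each
// operand and multiply them into a full 64-bit result.
static V2x64 pmuludq(const V2x64 &A, const V2x64 &B) {
  V2x64 R{};
  for (size_t I = 0; I != 2; ++I)
    R[I] = uint64_t(uint32_t(A[I])) * uint64_t(uint32_t(B[I]));
  return R;
}

int main() {
  V2x64 A{0x1234567800000003ULL, 0xffffffff00000005ULL};
  V2x64 B{0x0000000000000007ULL, 0x0000000000000000ULL};

  // Property 1: the upper 32 bits of each operand lane are never demanded,
  // so clearing them cannot change the result (DemandedMask = low 32 bits).
  V2x64 ALow{A[0] & 0xffffffffULL, A[1] & 0xffffffffULL};
  assert(pmuludq(A, B) == pmuludq(ALow, B));

  // Property 2: a known-zero lane in either operand gives a known-zero
  // result lane (KnownZero = LHSZero | RHSZero).
  assert(pmuludq(A, B)[1] == 0);
  return 0;
}

The same reasoning holds for the signed PMULDQ node: it also reads only the low 32 bits of each source lane (sign-extended rather than zero-extended), and a zero lane in either operand still zeroes the product lane, which is why both opcodes share the handling added above.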