From: Sanjay Patel
Date: Mon, 13 May 2019 14:31:14 +0000 (+0000)
Subject: [DAGCombiner] narrow vector binop with inserts/extract
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f098d456c4a2b888113b20d975ce08543bac8460;p=llvm

We catch most of these patterns (on x86 at least) by matching a
concat_vectors opcode early in combining, but the same pattern may
emerge later using insert_subvector instead.

The AVX1 diffs for the add/sub overflow tests show another missed
narrowing pattern. That one may be falling through the cracks because
of combine ordering and multiple uses.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360585 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 12490e6c6ef..fcbb9fcd990 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17409,12 +17409,45 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   return SDValue();
 }
 
+static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
+                                              SelectionDAG &DAG) {
+  SDValue BinOp = Extract->getOperand(0);
+  if (!ISD::isBinaryOp(BinOp.getNode()))
+    return SDValue();
+
+  SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
+  SDValue Index = Extract->getOperand(1);
+  EVT VT = Extract->getValueType(0);
+  bool IsInsert0 = Bop0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+                   Bop0.getOperand(1).getValueType() == VT &&
+                   Bop0.getOperand(2) == Index;
+  bool IsInsert1 = Bop1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+                   Bop1.getOperand(1).getValueType() == VT &&
+                   Bop1.getOperand(2) == Index;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // TODO: We could handle the case where only 1 operand is being inserted by
+  //       creating an extract of the other operand, but that requires checking
+  //       number of uses and/or costs.
+  if (!IsInsert0 || !IsInsert1 ||
+      !TLI.isOperationLegalOrCustom(BinOp.getOpcode(), VT))
+    return SDValue();
+
+  // We are inserting both operands of the wide binop only to extract back
+  // to the narrow vector size. Eliminate all of the insert/extract:
+  // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
+  return DAG.getNode(BinOp.getOpcode(), SDLoc(Extract), VT, Bop0.getOperand(1),
+                     Bop1.getOperand(1), BinOp->getFlags());
+}
+
 /// If we are extracting a subvector produced by a wide binary operator try
 /// to use a narrow binary operator and/or avoid concatenation and extraction.
 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
   //       some of these bailouts with other transforms.
 
+  if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG))
+    return V;
+
   // The extract index must be a constant, so we can map it to a concat operand.
   auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
   if (!ExtractIndexC)
@@ -17493,7 +17526,6 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
 
   // We need at least one concatenation operation of a binop operand to make
   // this transform worthwhile. The concat must double the input vector sizes.
-  // TODO: Should we also handle INSERT_SUBVECTOR patterns?
   SDValue LHS = peekThroughBitcasts(BinOp.getOperand(0));
   SDValue RHS = peekThroughBitcasts(BinOp.getOperand(1));
   bool ConcatL =
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 1f7ec8d22be..0bbae5c85e9 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -1522,50 +1522,19 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: sad_double_reduction:
-; AVX1:       # %bb.0: # %bb
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX1-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sad_double_reduction:
-; AVX2:       # %bb.0: # %bb
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sad_double_reduction:
-; AVX512:       # %bb.0: # %bb
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX512-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: sad_double_reduction:
+; AVX:       # %bb.0: # %bb
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
 bb:
   %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
   %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
diff --git a/test/CodeGen/X86/vec_saddo.ll b/test/CodeGen/X86/vec_saddo.ll
index dafd9e2f5e0..628f758521f 100644
--- a/test/CodeGen/X86/vec_saddo.ll
+++ b/test/CodeGen/X86/vec_saddo.ll
@@ -818,20 +818,20 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm6
 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm11
 ; AVX1-NEXT:    vpaddd %xmm9, %xmm7, %xmm9
 ; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm10
-; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm10
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm12
+; AVX1-NEXT:    vpcmpgtd %xmm12, %xmm5, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm6, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm3, %ymm3
-; AVX1-NEXT:    vandps %ymm3, %ymm8, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm3
+; AVX1-NEXT:    vandps %ymm3, %ymm11, %ymm3
+; AVX1-NEXT:    vpandn %xmm8, %xmm7, %xmm6
 ; AVX1-NEXT:    vpackssdw %xmm6, %xmm3, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
 ; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7
@@ -839,30 +839,30 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm1
 ; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm1, %xmm11
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm1, %xmm10
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm11
 ; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm7, %xmm11
+; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm11, %ymm11
 ; AVX1-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm6
 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm6
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm2
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm11, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm2, %ymm11, %ymm2
+; AVX1-NEXT:    vpandn %xmm10, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpacksswb %xmm8, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm3
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
 ; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
diff --git a/test/CodeGen/X86/vec_ssubo.ll b/test/CodeGen/X86/vec_ssubo.ll
index 8497181cd34..d0bd0a6e3f3 100644
--- a/test/CodeGen/X86/vec_ssubo.ll
+++ b/test/CodeGen/X86/vec_ssubo.ll
@@ -835,48 +835,48 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm5, %xmm7
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12
-; AVX1-NEXT:    vpcmpgtd %xmm12, %xmm5, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm10
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm13
+; AVX1-NEXT:    vpcmpgtd %xmm13, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm11
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm11, %xmm9
+; AVX1-NEXT:    vpxor %xmm4, %xmm9, %xmm10
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm7, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm6
 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm6, %ymm9
-; AVX1-NEXT:    vpsubd %xmm8, %xmm12, %xmm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm6, %ymm10
+; AVX1-NEXT:    vpsubd %xmm8, %xmm13, %xmm8
 ; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm5, %xmm6
 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm11, %xmm6
 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm10
-; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm11
+; AVX1-NEXT:    vpcmpgtd %xmm11, %xmm5, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm7, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX1-NEXT:    vandps %ymm3, %ymm9, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT:    vandps %ymm3, %ymm10, %ymm3
+; AVX1-NEXT:    vpandn %xmm6, %xmm9, %xmm6
 ; AVX1-NEXT:    vpackssdw %xmm6, %xmm3, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm13
-; AVX1-NEXT:    vpcmpgtd %xmm13, %xmm5, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm14
+; AVX1-NEXT:    vpcmpgtd %xmm14, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm1
 ; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm1, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm11
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm1, %xmm12
+; AVX1-NEXT:    vpxor %xmm4, %xmm12, %xmm10
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm13
 ; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm13, %xmm7, %xmm6
 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm6, %ymm11
-; AVX1-NEXT:    vpsubd %xmm13, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm6, %ymm10
+; AVX1-NEXT:    vpsubd %xmm14, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm6
 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm1, %xmm1
@@ -886,13 +886,13 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm11, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm2, %ymm10, %ymm2
+; AVX1-NEXT:    vpandn %xmm1, %xmm12, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpacksswb %xmm9, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm10, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm3
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
 ; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll
index c3ba795a6b5..aaeb54dd72e 100644
--- a/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/test/CodeGen/X86/vector-narrow-binop.ll
@@ -178,23 +178,21 @@ define <4 x double> @fmul_v2f64(<2 x double> %x, <2 x double> %y) {
 ;
 ; AVX1-LABEL: fmul_v2f64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmulpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmulpd %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fmul_v2f64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmulpd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmulpd %xmm2, %xmm2, %xmm1
+; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    retq
 ;
@@ -202,10 +200,8 @@ define <4 x double> @fmul_v2f64(<2 x double> %x, <2 x double> %y) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmulpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm2 * xmm2) + xmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT:    retq
   %s = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
diff --git a/test/CodeGen/X86/vector-reduce-xor-bool.ll b/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 9cfa1dc7561..29d032b4b44 100644
--- a/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -1969,22 +1969,20 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>) {
 ;
 ; AVX1-LABEL: icmp_v64i8_v64i1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
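
For reference, the kind of source that reaches this combine can be written as a small IR function in the shape of the fmul_v2f64 diff above. This is an illustrative sketch, not a test from this commit: the function name is made up, and whether the combine fires depends on how the target lowers the shuffles. On an AVX1 target, the v4f64 shuffle below is assembled from 128-bit halves with vinsertf128 (insert_subvector), and the final shuffle becomes an extract_subvector of the wide product:

define <2 x double> @extract_squared_hi(<2 x double> %x, <2 x double> %y) {
  ; Interleave x and y; AVX1 lowering composes this <4 x double> from two
  ; 128-bit halves via insert_subvector (vinsertf128).
  %s = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
  ; Wide binop whose operands are the inserted values.
  %sq = fmul <4 x double> %s, %s
  ; Only the high 128-bit half of the product is used, so per
  ; ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
  ; the multiply should narrow to a single 128-bit fmul.
  %hi = shufflevector <4 x double> %sq, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %hi
}

Running llc -mtriple=x86_64-- -mattr=+avx on a function like this with the patch applied should show vmulpd operating on %xmm registers rather than a widened %ymm multiply followed by vextractf128.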