From: Sanjay Patel
Date: Sun, 12 May 2019 14:43:20 +0000 (+0000)
Subject: [DAGCombiner] try to move bitcast after extract_subvector
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=874cb15d79cd15d4083e1a2b3144d18a8d2c7953;p=llvm

[DAGCombiner] try to move bitcast after extract_subvector

I noticed that we were failing to narrow an x86 ymm math op in a case similar
to the 'madd' test diff. That is because a bitcast is sitting between the math
and the extract subvector and thwarting our pattern matching for narrowing:

  t56: v8i32 = add t59, t58
  t68: v4i64 = bitcast t56
  t73: v2i64 = extract_subvector t68, Constant:i64<2>
  t96: v4i32 = bitcast t73

There are a few wins and neutral diffs in the other tests.

Differential Revision: https://reviews.llvm.org/D61806

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360541 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1d8f530f4c5..ed805e0750f 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17583,6 +17583,30 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
     }
   }
 
+  // Try to move vector bitcast after extract_subv by scaling extraction index:
+  // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
+  if (isa<ConstantSDNode>(Index) && V.getOpcode() == ISD::BITCAST &&
+      V.getOperand(0).getValueType().isVector()) {
+    SDValue SrcOp = V.getOperand(0);
+    EVT SrcVT = SrcOp.getValueType();
+    unsigned SrcNumElts = SrcVT.getVectorNumElements();
+    unsigned DestNumElts = V.getValueType().getVectorNumElements();
+    if ((SrcNumElts % DestNumElts) == 0) {
+      unsigned SrcDestRatio = SrcNumElts / DestNumElts;
+      unsigned NewExtNumElts = NVT.getVectorNumElements() * SrcDestRatio;
+      EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+                                      NewExtNumElts);
+      if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+        unsigned IndexValScaled = N->getConstantOperandVal(1) * SrcDestRatio;
+        SDLoc DL(N);
+        SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+        SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+                                         V.getOperand(0), NewIndex);
+        return DAG.getBitcast(NVT, NewExtract);
+      }
+    }
+  }
+
   // Combine:
   //    (extract_subvec (concat V1, V2, ...), i)
   // Into:
diff --git a/test/CodeGen/X86/extractelement-fp.ll b/test/CodeGen/X86/extractelement-fp.ll
index 9c038ed7026..7d5f18b59e8 100644
--- a/test/CodeGen/X86/extractelement-fp.ll
+++ b/test/CodeGen/X86/extractelement-fp.ll
@@ -26,7 +26,8 @@ define float @fneg_v4f32(<4 x float> %x) nounwind {
 define double @fneg_v4f64(<4 x double> %x) nounwind {
 ; X64-LABEL: fneg_v4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT:    vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; X64-NEXT:    # xmm1 = mem[0,0]
 ; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -37,7 +38,8 @@ define double @fneg_v4f64(<4 x double> %x) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; X86-NEXT:    # xmm1 = mem[0,0]
 ; X86-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlps %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index 517492b848d..41f841a8ed9 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -2673,36 +2673,19 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: madd_double_reduction:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX1-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    retq
-;
-; AVX256-LABEL: madd_double_reduction:
-; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX256-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX256-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX256-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
-; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX256-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vmovd %xmm0, %eax
-; AVX256-NEXT:    vzeroupper
-; AVX256-NEXT:    retq
+; AVX-LABEL: madd_double_reduction:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
   %tmp = load <8 x i16>, <8 x i16>* %arg, align 1
   %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
   %tmp7 = sext <8 x i16> %tmp to <8 x i32>
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index db204a395ad..5bfb767faee 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -594,12 +594,12 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
 ; AVX1-NEXT:    vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
 ; AVX1-NEXT:    vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 ; AVX1-NEXT:    vmovaps %xmm0, 32(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm2, (%rdi)
@@ -640,12 +640,12 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
 ; XOP-NEXT:    vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
-; XOP-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
+; XOP-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
 ; XOP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
-; XOP-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
+; XOP-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; XOP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 ; XOP-NEXT:    vmovaps %xmm0, 32(%rdi)
 ; XOP-NEXT:    vmovaps %ymm2, (%rdi)
@@ -1356,7 +1356,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm7 = xmm7[1,1,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3]
@@ -1365,7 +1365,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX1-NEXT:    vmovups %ymm4, (%rsi)
@@ -1456,7 +1456,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; XOP-NEXT:    vextractf128 $1, %ymm6, %xmm7
 ; XOP-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
 ; XOP-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
-; XOP-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; XOP-NEXT:    vpermilps {{.*#+}} xmm7 = xmm7[1,1,3,3]
 ; XOP-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
 ; XOP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
 ; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3]
@@ -1465,7 +1465,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; XOP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; XOP-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; XOP-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; XOP-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; XOP-NEXT:    vmovups %ymm4, (%rsi)
@@ -1571,35 +1571,34 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX1-LABEL: interleave_24i32_in:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovupd (%rsi), %ymm0
-; AVX1-NEXT:    vmovupd (%rcx), %ymm1
-; AVX1-NEXT:    vmovups 16(%rcx), %xmm2
-; AVX1-NEXT:    vmovups (%rdx), %xmm3
-; AVX1-NEXT:    vmovups 16(%rdx), %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX1-NEXT:    vmovups (%rsi), %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX1-NEXT:    vmovups 16(%rcx), %xmm1
+; AVX1-NEXT:    vmovups (%rdx), %xmm2
+; AVX1-NEXT:    vmovups 16(%rdx), %xmm3
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[3,0],xmm1[3,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,1],xmm4[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[1,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT:    vmovups (%rsi), %xmm3
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm2[2,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT:    vmovups %ymm3, (%rdi)
-; AVX1-NEXT:    vmovups %ymm2, 64(%rdi)
+; AVX1-NEXT:    vmovups %ymm2, (%rdi)
+; AVX1-NEXT:    vmovups %ymm1, 64(%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1681,7 +1680,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
 ; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; XOP-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
+; XOP-NEXT:    vpermilps {{.*#+}} xmm4 = mem[0,1,0,1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
 ; XOP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; XOP-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 317e5885192..7197c1cdaa0 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -1438,13 +1438,11 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX2:       # %bb.0: # %bb
 ; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
 ; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX2-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
 ; AVX2-NEXT:    movl $1, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vmovdqu (%rdx), %xmm2
+; AVX2-NEXT:    vpsadbw (%rcx), %xmm2, %xmm2
+; AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -1545,15 +1543,12 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
 ; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: sad_double_reduction:
@@ -1563,8 +1558,6 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX512-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
 ; AVX512-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
 ; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
diff --git a/test/CodeGen/X86/vector-fshl-256.ll b/test/CodeGen/X86/vector-fshl-256.ll
index dd9028dd92b..3fa183e8fe6 100644
--- a/test/CodeGen/X86/vector-fshl-256.ll
+++ b/test/CodeGen/X86/vector-fshl-256.ll
@@ -906,7 +906,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -1028,7 +1028,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
diff --git a/test/CodeGen/X86/vector-fshl-rot-256.ll b/test/CodeGen/X86/vector-fshl-rot-256.ll
index 65fa6f20fee..f2da6053616 100644
--- a/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -519,7 +519,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
@@ -583,7 +583,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-fshr-256.ll b/test/CodeGen/X86/vector-fshr-256.ll
index 61b7c55e557..c3e58ae30a3 100644
--- a/test/CodeGen/X86/vector-fshr-256.ll
+++ b/test/CodeGen/X86/vector-fshr-256.ll
@@ -910,7 +910,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
@@ -1031,7 +1031,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
diff --git a/test/CodeGen/X86/vector-fshr-rot-256.ll b/test/CodeGen/X86/vector-fshr-rot-256.ll
index 6e17724bdc1..6a2f6a889d6 100644
--- a/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -564,7 +564,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
@@ -628,7 +628,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index aa3c647cc2f..606067f9eef 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -512,7 +512,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_rotate_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
 ; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -564,7 +564,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; XOPAVX1-LABEL: splatvar_rotate_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 7f6d49e7660..2f059a80765 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -677,7 +677,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; XOPAVX1-LABEL: splatvar_shift_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index ff3edc3bc3b..1b9bc124c1b 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -689,7 +689,7 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
 define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -895,7 +895,7 @@ define <4 x i64> @shuffle_v4i64_0213(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0124:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX1-NEXT:    retq
@@ -956,7 +956,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0412:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -1110,7 +1110,7 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
 ; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
 ; AVX1-NEXT:    retq
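
For reference, a minimal standalone sketch (not part of the patch) of the index
scaling that the new combine performs. It mirrors the commit-message example
(a v8i32 source bitcast to v4i64, extracting a v2i64 at index 2); the VecTy
helper and function names below are hypothetical, while the in-tree code works
on EVT/SDValue via DAG.getNode and DAG.getBitcast as shown in the
DAGCombiner.cpp hunk above.

  // Index' = Index * (SrcNumElts / DestNumElts); the new extract keeps the
  // source element type with a correspondingly scaled element count.
  #include <cassert>
  #include <cstdio>

  struct VecTy {
    unsigned NumElts; // number of vector elements
    unsigned EltBits; // bits per element
  };

  // Scale an extract_subvector index from the bitcast result type (Dest)
  // back to the bitcast source type (Src).
  unsigned scaleExtractIndex(VecTy Src, VecTy Dest, unsigned Index) {
    assert(Src.NumElts * Src.EltBits == Dest.NumElts * Dest.EltBits &&
           "bitcast must preserve the total bit width");
    assert(Src.NumElts % Dest.NumElts == 0 && "combine only fires in this case");
    return Index * (Src.NumElts / Dest.NumElts);
  }

  int main() {
    VecTy Src = {8, 32};  // v8i32 (the ymm math op's type)
    VecTy Dest = {4, 64}; // v4i64 (the bitcast's type)
    unsigned NewIndex = scaleExtractIndex(Src, Dest, /*Index=*/2);
    // The rewritten node extracts v4i32 starting at element 4 of the v8i32
    // source, so the 256-bit op can then be narrowed to 128 bits.
    std::printf("scaled extract index = %u\n", NewIndex); // prints 4
    return 0;
  }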