}
}
+ // Try to move vector bitcast after extract_subv by scaling extraction index:
+ // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
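+ // For example, extracting the high v2f64 half of a v4f64 that was
+ // bitcast from v8f32 (so SrcDestRatio == 2):
+ //   v2f64 extract_subv (v4f64 bitcast (v8f32 X)), 2
+ //   --> v2f64 bitcast (v4f32 extract_subv (v8f32 X), 4)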
+ if (isa<ConstantSDNode>(Index) && V.getOpcode() == ISD::BITCAST &&
+ V.getOperand(0).getValueType().isVector()) {
+ SDValue SrcOp = V.getOperand(0);
+ EVT SrcVT = SrcOp.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+ unsigned DestNumElts = V.getValueType().getVectorNumElements();
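+ // Only handle the case where the bitcast narrows (or preserves) the
+ // element width, i.e. the pre-bitcast vector has a whole multiple of the
+ // post-bitcast element count, so the index scales by an integer ratio.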
+ if ((SrcNumElts % DestNumElts) == 0) {
+ unsigned SrcDestRatio = SrcNumElts / DestNumElts;
+ unsigned NewExtNumElts = NVT.getVectorNumElements() * SrcDestRatio;
+ EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ NewExtNumElts);
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ unsigned IndexValScaled = N->getConstantOperandVal(1) * SrcDestRatio;
+ SDLoc DL(N);
+ SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ SrcOp, NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
+ }
+ }
+
// Combine:
// (extract_subvec (concat V1, V2, ...), i)
// Into:
define double @fneg_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fneg_v4f64:
; X64: # %bb.0:
-; X64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; X64-NEXT: # xmm1 = mem[0,0]
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
-; X86-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; X86-NEXT: # xmm1 = mem[0,0]
; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX1-LABEL: madd_double_reduction:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vmovdqu (%rdx), %xmm1
-; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX256-LABEL: madd_double_reduction:
-; AVX256: # %bb.0:
-; AVX256-NEXT: vmovdqu (%rdi), %xmm0
-; AVX256-NEXT: vmovdqu (%rdx), %xmm1
-; AVX256-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX256-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX256-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vmovd %xmm0, %eax
-; AVX256-NEXT: vzeroupper
-; AVX256-NEXT: retq
+; AVX-LABEL: madd_double_reduction:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vmovdqu (%rdx), %xmm1
+; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
%tmp = load <8 x i16>, <8 x i16>* %arg, align 1
%tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
%tmp7 = sext <8 x i16> %tmp to <8 x i32>
; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm2, (%rdi)
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
-; XOP-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
+; XOP-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
-; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
+; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; XOP-NEXT: vmovaps %xmm0, 32(%rdi)
; XOP-NEXT: vmovaps %ymm2, (%rdi)
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX1-NEXT: vmovups %ymm4, (%rsi)
; XOP-NEXT: vextractf128 $1, %ymm6, %xmm7
; XOP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
-; XOP-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,3,3]
; XOP-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; XOP-NEXT: vmovups %ymm4, (%rsi)
; AVX1-LABEL: interleave_24i32_in:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rsi), %ymm0
-; AVX1-NEXT: vmovupd (%rcx), %ymm1
-; AVX1-NEXT: vmovups 16(%rcx), %xmm2
-; AVX1-NEXT: vmovups (%rdx), %xmm3
-; AVX1-NEXT: vmovups 16(%rdx), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX1-NEXT: vmovups (%rsi), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX1-NEXT: vmovups 16(%rcx), %xmm1
+; AVX1-NEXT: vmovups (%rdx), %xmm2
+; AVX1-NEXT: vmovups 16(%rdx), %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,0],xmm1[3,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,1],xmm4[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT: vmovups (%rsi), %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm2[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm3, (%rdi)
-; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm2, (%rdi)
+; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
+; XOP-NEXT: vpermilps {{.*#+}} xmm4 = mem[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
; AVX2: # %bb.0: # %bb
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu (%rdx), %xmm1
-; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vmovdqu (%rdx), %xmm2
+; AVX2-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdx), %xmm1
; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_double_reduction:
; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
;
; XOPAVX1-LABEL: splatvar_rotate_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0124:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-NEXT: retq
define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0412:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq