(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
- SrcVT = DstVT = MaskVT;
+ SrcVT = DstVT = MVT::v2f64;
return true;
}
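// Mask {4, 1, 2, 3}: lane 0 comes from the second operand, lanes 1-3 from the
// first, i.e. the register form of MOVSS.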
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
- SrcVT = DstVT = MaskVT;
+ SrcVT = DstVT = MVT::v4f32;
return true;
}
}
}
}
- SDValue V0 = peekThroughBitcasts(N0);
- SDValue V1 = peekThroughBitcasts(N1);
- bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
- bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
- if (isZero0 && isZero1)
- return SDValue();
-
- // We often lower to MOVSD/MOVSS from integer as well as native float
- // types; remove unnecessary domain-crossing bitcasts if we can to make it
- // easier to combine shuffles later on. We've already accounted for the
- // domain switching cost when we decided to lower with it.
- bool isFloat = VT.isFloatingPoint();
- bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
- bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
- if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
- MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
- : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
- V0 = DAG.getBitcast(NewVT, V0);
- V1 = DAG.getBitcast(NewVT, V1);
- return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
- }
-
return SDValue();
}
case X86ISD::INSERTPS: {
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
}
-let Predicates = [HasAVX512, OptForSize] in {
- // Shuffle with VMOVSS
- def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
- (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
-
- // Shuffle with VMOVSD
- def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
-}
-
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>;
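+// Like SDTShuff2Op, but restricted to floating-point vector types.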
+def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>;
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisFP<0>, SDTCisInt<2>,
def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
-def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
-def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
-def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
-def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
SDTCisVec<1>, SDTCisInt<1>,
(v2i64 (VMOVSDrr (v2i64 (V_SET0)),
(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
sub_xmm)>;
-
- // Shuffle with VMOVSS
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (VMOVSSrr VR128:$src1, VR128:$src2)>;
-
- // Shuffle with VMOVSD
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, VR128:$src2)>;
}
let Predicates = [UseSSE1] in {
(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
- // Shuffle with MOVSS
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr VR128:$src1, VR128:$src2)>;
}
// MOVSSrm already zeros the high parts of the register.
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-
- let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
- // Shuffle with MOVSD
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, VR128:$src2)>;
- }
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
- def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
- def : Pat<(v4i32 (X86Movss (bc_v4i32 (loadv2i64 addr:$src2)), VR128:$src1)),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
- def : Pat<(v2i64 (X86Movsd VR128:$src1, (loadv2i64 addr:$src2))),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
- def : Pat<(v2i64 (X86Movsd (loadv2i64 addr:$src2), VR128:$src1)),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (PBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
- def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
- (PBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
- def : Pat<(v4i32 (X86Movss (bc_v4i32 (memopv2i64 addr:$src2)), VR128:$src1)),
- (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
(BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
(BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (PBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
- def : Pat<(v2i64 (X86Movsd VR128:$src1, (memopv2i64 addr:$src2))),
- (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
- def : Pat<(v2i64 (X86Movsd (memopv2i64 addr:$src2), VR128:$src1)),
- (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
}
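
For reference, the v4i32/v2i64 X86Movss/X86Movsd selection patterns removed above handled shuffles of the shape sketched below. With the nodes now restricted to FP types by SDTShuff2OpFP, these integer cases are expected to go through the regular blend/shuffle lowering (or FP bitcasts) rather than a direct movss/movsd, which is what the test updates that follow reflect. A minimal IR sketch; the function names are illustrative only:

define <4 x i32> @movss_style_v4i32(<4 x i32> %a, <4 x i32> %b) {
  ; Lane 0 from %b, lanes 1-3 from %a: the "movss" shuffle pattern.
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %r
}

define <2 x i64> @movsd_style_v2i64(<2 x i64> %a, <2 x i64> %b) {
  ; Lane 0 from %b, lane 1 from %a: the "movsd" shuffle pattern.
  %r = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
  ret <2 x i64> %r
}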
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
-; SSE2-NEXT: movups 80(%rdi), %xmm5
-; SSE2-NEXT: movups 64(%rdi), %xmm8
+; SSE2-NEXT: movups 80(%rdi), %xmm9
+; SSE2-NEXT: movups 64(%rdi), %xmm10
; SSE2-NEXT: movups (%rdi), %xmm0
-; SSE2-NEXT: movups 16(%rdi), %xmm6
-; SSE2-NEXT: movups 32(%rdi), %xmm2
-; SSE2-NEXT: movups 48(%rdi), %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
-; SSE2-NEXT: movaps %xmm5, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
-; SSE2-NEXT: movaps %xmm2, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
+; SSE2-NEXT: movups 16(%rdi), %xmm11
+; SSE2-NEXT: movups 32(%rdi), %xmm8
+; SSE2-NEXT: movups 48(%rdi), %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; SSE2-NEXT: movaps %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: movaps %xmm8, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
; SSE2-NEXT: movups %xmm3, 16(%rsi)
-; SSE2-NEXT: movups %xmm4, (%rsi)
-; SSE2-NEXT: movups %xmm1, 16(%rdx)
+; SSE2-NEXT: movups %xmm5, (%rsi)
+; SSE2-NEXT: movups %xmm2, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movupd %xmm7, 16(%rcx)
-; SSE2-NEXT: movupd %xmm9, (%rcx)
+; SSE2-NEXT: movups %xmm7, 16(%rcx)
+; SSE2-NEXT: movups %xmm1, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE2-NEXT: psraw $2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psraw $4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: movapd %xmm1, %xmm2
+; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; X32-SSE-NEXT: psraw $2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT: movaps %xmm2, %xmm0
+; X32-SSE-NEXT: andps %xmm1, %xmm0
; X32-SSE-NEXT: psraw $1, %xmm2
-; X32-SSE-NEXT: pandn %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: andnps %xmm2, %xmm1
+; X32-SSE-NEXT: orps %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <8 x i16> %shift
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: movapd %xmm1, %xmm2
+; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; X32-SSE-NEXT: psrlw $2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT: movaps %xmm2, %xmm0
+; X32-SSE-NEXT: andps %xmm1, %xmm0
; X32-SSE-NEXT: psrlw $1, %xmm2
-; X32-SSE-NEXT: pandn %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: andnps %xmm2, %xmm1
+; X32-SSE-NEXT: orps %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <8 x i16> %shift
; SSE2-LABEL: shuffle_v8i16_032dXXXX:
; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: retq
; SSE2-LABEL: shuffle_v8i16_012dcde3:
; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XX4X8acX: