From dbb8cd704893772948072c2fc6e5e6d73a1a3597 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 8 Jul 2019 06:52:43 +0000 Subject: [PATCH] [X86] Make movsd commutable to shufpd with a 0x02 immediate on pre-SSE4.1 targets. This can help avoid a copy or enable load folding. On SSE4.1 targets we can commute it to blendi instead. I had to make shufpd with a 0x02 immediate commutable as well since we expect commuting to be reversible. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365292 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.cpp | 50 +++++++++---- lib/Target/X86/X86InstrSSE.td | 6 +- test/CodeGen/X86/buildvec-insertvec.ll | 4 +- test/CodeGen/X86/coalesce_commute_movsd.ll | 3 +- test/CodeGen/X86/combine-sdiv.ll | 75 +++++++++---------- test/CodeGen/X86/psubus.ll | 63 ++++++++-------- test/CodeGen/X86/sdiv-exact.ll | 17 ++--- test/CodeGen/X86/sse2.ll | 5 +- test/CodeGen/X86/vector-blend.ll | 60 +++++---------- test/CodeGen/X86/vector-shift-ashr-sub128.ll | 44 +++++------ test/CodeGen/X86/vector-shuffle-128-v2.ll | 39 ++++------ test/CodeGen/X86/vector-shuffle-128-v4.ll | 9 +-- test/CodeGen/X86/vector-shuffle-128-v8.ll | 12 +-- .../X86/vector-shuffle-combining-ssse3.ll | 6 +- test/CodeGen/X86/vector-shuffle-combining.ll | 12 +-- test/CodeGen/X86/vselect-2.ll | 6 +- test/CodeGen/X86/vselect.ll | 12 +-- test/CodeGen/X86/x86-shifts.ll | 12 +-- 18 files changed, 206 insertions(+), 229 deletions(-) diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index f6f4e7d2b2c..e39819f4ac6 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1542,20 +1542,39 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VMOVSDrr: case X86::VMOVSSrr:{ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. - assert(Subtarget.hasSSE41() && "Commuting MOVSD/MOVSS requires SSE41!"); + if (Subtarget.hasSSE41()) { + unsigned Mask, Opc; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; + case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; + case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; + case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + } - unsigned Mask, Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("Unreachable!"); - case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; - case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; - case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; - case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } + // Convert to SHUFPD. + assert(MI.getOpcode() == X86::MOVSDrr && + "Can only commute MOVSDrr without SSE4.1"); + auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + WorkingMI.setDesc(get(X86::SHUFPDrri)); + WorkingMI.addOperand(MachineOperand::CreateImm(0x02)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + case X86::SHUFPDrri: { + // Commute to MOVSD. 
+ assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(X86::MOVSDrr));
+ WorkingMI.RemoveOperand(3);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
@@ -1874,13 +1893,18 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
}
return false;
}
- case X86::MOVSDrr:
case X86::MOVSSrr:
- case X86::VMOVSDrr:
- case X86::VMOVSSrr:
+ // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
+ // form an sse4.1 blend. We assume VMOVSSrr/VMOVSDrr are always commutable
+ // since AVX implies sse4.1.
if (Subtarget.hasSSE41())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
+ case X86::SHUFPDrri:
+ // We can commute this to MOVSD.
+ if (MI.getOperand(3).getImm() == 0x02)
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
case X86::MOVHLPSrr:
case X86::UNPCKHPDrr:
case X86::VMOVHLPSrr:
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f904f5a23d5..d25d216db19 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1951,12 +1951,14 @@ let Predicates = [UseSSE1] in {
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
- X86FoldableSchedWrite sched, Domain d> {
+ X86FoldableSchedWrite sched, Domain d,
+ bit IsCommutable = 0> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
(i8 imm:$src3))))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = IsCommutable in
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
@@ -1988,7 +1990,7 @@ let Constraints = "$src1 = $dst" in {
memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
+ memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}
//===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index d39e60149b4..5261a2b63a8 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -65,9 +65,7 @@ entry:
define <2 x double> @test_negative_zero_2(<2 x double> %A) {
; SSE2-LABEL: test_negative_zero_2:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = <u,-0.0E+0>
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_negative_zero_2:
diff --git a/test/CodeGen/X86/coalesce_commute_movsd.ll b/test/CodeGen/X86/coalesce_commute_movsd.ll
index a08bc6eacb0..31537b58131 100644
--- a/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -9,8 +9,7 @@
define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
; SSE2-LABEL: insert_f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_f64:
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index 39ce0c922e3..27d49cad966 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -1529,8 +1529,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
; 
SSE2-NEXT: psrlq $2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64: @@ -1616,24 +1615,23 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; SSE2-NEXT: psrlq $2, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $61, %xmm0 -; SSE2-NEXT: psrlq $60, %xmm3 -; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE2-NEXT: paddq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $3, %xmm0 -; SSE2-NEXT: psrlq $4, %xmm3 -; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE2-NEXT: movapd {{.*#+}} xmm0 = [1152921504606846976,576460752303423488] -; SSE2-NEXT: xorpd %xmm0, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movapd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psrlq $61, %xmm3 +; SSE2-NEXT: psrlq $60, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrlq $3, %xmm1 +; SSE2-NEXT: psrlq $4, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] +; SSE2-NEXT: xorpd %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64: @@ -1745,29 +1743,28 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: psrlq $62, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: psrlq $62, %xmm4 +; SSE2-NEXT: paddq %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrad $2, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE2-NEXT: psrlq $2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: psrlq $62, %xmm2 -; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrlq $2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: psrlq $62, %xmm4 +; SSE2-NEXT: paddq %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrad $2, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE2-NEXT: psrlq $2, %xmm2 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE2-NEXT: psrlq $2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm4[1] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll index 35e78e6e560..34f4bf23ced 100644 --- a/test/CodeGen/X86/psubus.ll +++ b/test/CodeGen/X86/psubus.ll @@ -657,46 +657,47 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; ; SSSE3-LABEL: test14: ; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: movdqa %xmm0, %xmm11 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; SSSE3-NEXT: movdqa %xmm11, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSSE3-NEXT: movdqa %xmm0, %xmm10 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm2, %xmm9 -; SSSE3-NEXT: pxor %xmm0, %xmm9 -; SSSE3-NEXT: psubd %xmm5, %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm7, %xmm9 +; SSSE3-NEXT: psubd %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: por %xmm7, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; SSSE3-NEXT: pshufb %xmm9, %xmm5 ; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm7, %xmm6 ; SSSE3-NEXT: psubd %xmm10, %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm10 -; SSSE3-NEXT: pshufb %xmm9, %xmm10 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = 
xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSSE3-NEXT: movdqa %xmm10, %xmm0
+; SSSE3-NEXT: por %xmm7, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm9, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm7, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT: pxor %xmm7, %xmm5
+; SSSE3-NEXT: psubd %xmm11, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm5, %xmm7
+; SSSE3-NEXT: pshufb %xmm5, %xmm11
; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm0, %xmm6
-; SSSE3-NEXT: por %xmm8, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
-; SSSE3-NEXT: pshufb %xmm5, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
+; SSSE3-NEXT: pxor %xmm7, %xmm6
+; SSSE3-NEXT: por %xmm8, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pshufb %xmm5, %xmm7
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
+; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm7[1]
; SSSE3-NEXT: psubd %xmm8, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT: pand %xmm5, %xmm4
diff --git a/test/CodeGen/X86/sdiv-exact.ll b/test/CodeGen/X86/sdiv-exact.ll
index c19ae822d71..3caaf40e718 100644
--- a/test/CodeGen/X86/sdiv-exact.ll
+++ b/test/CodeGen/X86/sdiv-exact.ll
@@ -82,17 +82,16 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X86: # %bb.0:
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrad $3, %xmm1
-; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3264175145,3264175145]
-; X86-NEXT: movapd %xmm0, %xmm1
-; X86-NEXT: pmuludq %xmm2, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT: movapd %xmm1, %xmm0
+; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-NEXT: pmuludq %xmm0, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test5:
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index b62e52a800b..c1ce6e67ed0 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -29,9 +29,8 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
;
; X64-SSE-LABEL: test1:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movapd (%rsi), %xmm1
-; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X64-SSE-NEXT: movapd %xmm1, (%rdi)
+; X64-SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; X64-SSE-NEXT: movapd %xmm0, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test1:
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index 5008a1e865d..549e44471d6 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -149,14 +149,12 @@ entry:
define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
; SSE2-LABEL: vsel_double:
; SSE2: # 
%bb.0: # %entry -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_double: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_double: @@ -176,14 +174,12 @@ entry: define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) { ; SSE2-LABEL: vsel_i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_i64: @@ -340,20 +336,16 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movaps %xmm7, %xmm3 ; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSE2-NEXT: movapd %xmm4, %xmm0 -; SSE2-NEXT: movapd %xmm6, %xmm2 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm6[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_double8: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movaps %xmm7, %xmm3 ; SSSE3-NEXT: movaps %xmm5, %xmm1 -; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSSE3-NEXT: movapd %xmm4, %xmm0 -; SSSE3-NEXT: movapd %xmm6, %xmm2 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] +; SSSE3-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm6[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_double8: @@ -379,20 +371,16 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movaps %xmm7, %xmm3 ; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSE2-NEXT: movapd %xmm4, %xmm0 -; SSE2-NEXT: movapd %xmm6, %xmm2 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm6[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_i648: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movaps %xmm7, %xmm3 ; SSSE3-NEXT: movaps %xmm5, %xmm1 -; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSSE3-NEXT: movapd %xmm4, %xmm0 -; SSSE3-NEXT: movapd %xmm6, %xmm2 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] +; SSSE3-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm6[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_i648: @@ -416,18 +404,14 @@ entry: define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) { ; SSE2-LABEL: vsel_double4: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE2-NEXT: movapd %xmm2, %xmm0 -; SSE2-NEXT: movapd %xmm3, %xmm1 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_double4: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSSE3-NEXT: movapd %xmm2, %xmm0 -; SSSE3-NEXT: movapd %xmm3, %xmm1 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = 
xmm0[0],xmm2[1] +; SSSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_double4: @@ -529,15 +513,13 @@ define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) { ; SSE2-LABEL: constant_blendvpd_avx: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE2-NEXT: movapd %xmm3, %xmm1 +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: constant_blendvpd_avx: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movaps %xmm2, %xmm0 -; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSSE3-NEXT: movapd %xmm3, %xmm1 +; SSSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: constant_blendvpd_avx: @@ -713,14 +695,12 @@ entry: define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) { ; SSE2-LABEL: blend_shufflevector_4xdouble: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_shufflevector_4xdouble: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSSE3-NEXT: movapd %xmm2, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_shufflevector_4xdouble: diff --git a/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/test/CodeGen/X86/vector-shift-ashr-sub128.ll index fa4f4eaeaf1..8e4895849bf 100644 --- a/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -145,20 +145,21 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm3, %xmm0 -; X32-SSE-NEXT: psrlq %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrlq %xmm1, %xmm3 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] ; X32-SSE-NEXT: xorps %xmm5, %xmm5 ; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; X32-SSE-NEXT: psrlq %xmm5, %xmm3 -; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; X32-SSE-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE-NEXT: psrlq %xmm5, %xmm0 +; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: psrlq %xmm5, %xmm3 ; X32-SSE-NEXT: psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm3, %xmm0 -; X32-SSE-NEXT: psubq %xmm3, %xmm0 +; X32-SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm3[1] +; X32-SSE-NEXT: xorpd %xmm0, %xmm2 +; X32-SSE-NEXT: psubq %xmm0, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i32> %a, %b ret <2 x i32> %shift @@ -1057,21 +1058,22 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; X32-SSE-NEXT: psrad $31, %xmm0 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0] -; X32-SSE-NEXT: pand %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE-NEXT: psrlq %xmm3, %xmm0 +; X32-SSE-NEXT: 
movdqa {{.*#+}} xmm0 = [4294967295,0,4294967295,0] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: psrlq %xmm0, %xmm4 ; X32-SSE-NEXT: xorps %xmm5, %xmm5 ; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] -; X32-SSE-NEXT: psrlq %xmm5, %xmm4 -; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; X32-SSE-NEXT: psrlq %xmm5, %xmm3 +; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; X32-SSE-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE-NEXT: psrlq %xmm5, %xmm1 +; X32-SSE-NEXT: psrlq %xmm0, %xmm2 +; X32-SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; X32-SSE-NEXT: xorpd %xmm3, %xmm2 +; X32-SSE-NEXT: psubq %xmm3, %xmm2 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE-NEXT: psrlq %xmm5, %xmm0 -; X32-SSE-NEXT: psrlq %xmm3, %xmm2 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X32-SSE-NEXT: xorpd %xmm4, %xmm0 -; X32-SSE-NEXT: psubq %xmm4, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i32> %a, %splat diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index e3f33de6103..4318e40ec66 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -222,20 +222,17 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) { ; SSE2-LABEL: shuffle_v2f64_03: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2f64_03: ; SSE3: # %bb.0: -; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2f64_03: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v2f64_03: @@ -351,20 +348,17 @@ define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64 define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: shuffle_v2i64_03: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2i64_03: ; SSE3: # %bb.0: -; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2i64_03: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v2i64_03: @@ -382,20 +376,20 @@ define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) { define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: shuffle_v2i64_03_copy: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm2, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2i64_03_copy: ; SSE3: # %bb.0: -; SSE3-NEXT: movapd %xmm2, %xmm0 -; SSE3-NEXT: movsd {{.*#+}} xmm0 = 
xmm1[0],xmm0[1] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2i64_03_copy: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movapd %xmm2, %xmm0 -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v2i64_03_copy: @@ -1085,20 +1079,17 @@ define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) { define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) { ; SSE2-LABEL: insert_reg_lo_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_reg_lo_v2f64: ; SSE3: # %bb.0: -; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_reg_lo_v2f64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_reg_lo_v2f64: diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index a6f003fd209..21b04ccd200 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2255,20 +2255,17 @@ define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) { define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) { ; SSE2-LABEL: insert_reg_lo_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_reg_lo_v4f32: ; SSE3: # %bb.0: -; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_reg_lo_v4f32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_reg_lo_v4f32: diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index b441e0c1cca..87520abb060 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1260,9 +1260,9 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_032dXXXX: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE2-NEXT: retq @@ -1459,9 +1459,9 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_012dcde3: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3,2,1] -; 
SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,1,2,0,4,5,6,7] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index b42f2ac04e9..e01f5b4f576 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -43,8 +43,7 @@ define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) { define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) { ; SSSE3-LABEL: combine_pshufb_as_movsd: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_pshufb_as_movsd: @@ -669,8 +668,7 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea define <16 x i8> @combine_pshufb_pshufb_or_as_blend(<16 x i8> %a0, <16 x i8> %a1) { ; SSSE3-LABEL: combine_pshufb_pshufb_or_as_blend: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_pshufb_pshufb_or_as_blend: diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index d65d583838c..2594a0a9111 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2132,14 +2132,12 @@ define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_undef_input_test5: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_undef_input_test5: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_undef_input_test5: @@ -2316,14 +2314,12 @@ define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_undef_input_test15: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_undef_input_test15: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_undef_input_test15: diff --git a/test/CodeGen/X86/vselect-2.ll b/test/CodeGen/X86/vselect-2.ll index 3aeec1366a2..040f1d8b6f4 100644 --- a/test/CodeGen/X86/vselect-2.ll +++ b/test/CodeGen/X86/vselect-2.ll @@ -7,8 +7,7 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { ; SSE2-LABEL: test1: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = 
xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test1: @@ -46,8 +45,7 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { define <4 x float> @test3(<4 x float> %A, <4 x float> %B) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test3: diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll index 28f94443452..46adb4cc3ca 100644 --- a/test/CodeGen/X86/vselect.ll +++ b/test/CodeGen/X86/vselect.ll @@ -30,8 +30,7 @@ define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: test2: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test2: @@ -107,8 +106,7 @@ define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: test7: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test7: @@ -392,8 +390,7 @@ define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) { define <2 x double> @test24(<2 x double> %a, <2 x double> %b) { ; SSE2-LABEL: test24: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test24: @@ -412,8 +409,7 @@ define <2 x double> @test24(<2 x double> %a, <2 x double> %b) { define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: test25: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test25: diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll index f6191866edd..73dbb30a8c7 100644 --- a/test/CodeGen/X86/x86-shifts.ll +++ b/test/CodeGen/X86/x86-shifts.ll @@ -223,9 +223,9 @@ define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind { ; X32-NEXT: psrlq $8, %xmm2 ; X32-NEXT: movdqa %xmm0, %xmm1 ; X32-NEXT: psrlq $1, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X32-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; X32-NEXT: xorpd %xmm0, %xmm1 +; X32-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; X32-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] +; X32-NEXT: xorpd %xmm2, %xmm1 ; X32-NEXT: movapd %xmm1, %xmm0 ; X32-NEXT: retl ; @@ -235,9 +235,9 @@ define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind { ; X64-NEXT: psrlq $8, %xmm2 ; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psrlq $1, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X64-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; X64-NEXT: xorpd %xmm0, %xmm1 +; X64-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; X64-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] +; X64-NEXT: xorpd %xmm2, %xmm1 ; X64-NEXT: movapd %xmm1, %xmm0 ; X64-NEXT: retq entry: -- 2.50.1
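
The commutation above relies on the identity MOVSD(a, b) = {b[0], a[1]} = SHUFPD(b, a, 0x02), and with SSE4.1 also = BLENDPD(b, a, 0x02): swapping the two sources and picking immediate/mask 0x02 reproduces movsd's element selection. The short C++ program below is an illustrative sketch, not part of the patch (the file name and build line are assumptions); it checks the identity through the corresponding SSE intrinsics.

// movsd_commute_check.cpp -- illustrative sketch, not part of the patch.
// Verifies: _mm_move_sd(a, b) == _mm_shuffle_pd(b, a, 0x02)
//                             == _mm_blend_pd(b, a, 0x02)   (SSE4.1)
// Build (assumed): clang++ -msse4.1 movsd_commute_check.cpp
#include <immintrin.h>
#include <cassert>

static double lo(__m128d v) { return _mm_cvtsd_f64(v); }                     // element 0
static double hi(__m128d v) { return _mm_cvtsd_f64(_mm_unpackhi_pd(v, v)); } // element 1

int main() {
  __m128d a = _mm_set_pd(2.0, 1.0); // a = {1.0, 2.0} (element 0 is 1.0)
  __m128d b = _mm_set_pd(4.0, 3.0); // b = {3.0, 4.0}

  __m128d movsd   = _mm_move_sd(a, b);          // {b[0], a[1]} = {3.0, 2.0}
  __m128d shufpd  = _mm_shuffle_pd(b, a, 0x02); // sources swapped, imm 0x02
  __m128d blendpd = _mm_blend_pd(b, a, 0x02);   // sources swapped, mask 0x02

  assert(lo(movsd) == lo(shufpd) && hi(movsd) == hi(shufpd));
  assert(lo(movsd) == lo(blendpd) && hi(movsd) == hi(blendpd));
  return 0;
}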