From: Chandler Carruth
Date: Sun, 5 Oct 2014 22:57:31 +0000 (+0000)
Subject: [x86] Remove the 2-addr-to-3-addr "optimization" from shufps to pshufd.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1d02acb7a0330ad68bce53e80e5538bedfcb6c2f;p=llvm

[x86] Remove the 2-addr-to-3-addr "optimization" from shufps to pshufd.

This trades a (register-renamer-friendly) movaps for a floating point /
integer domain cross. That is a very bad trade, even on architectures where
domain crossing is relatively fast. On any chip where there is even a cycle
stall, this is a Very Bad Idea. It doesn't even seem likely to cause a spill
to be introduced, because the reason for the copy is to destructively shuffle
in place.

Thanks to Ben Kramer for fixing a bug in this code that my new shuffle
lowering exposed and highlighting that perhaps it should just go away. =]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@219090 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index dc97185ed95..a7637f9e65d 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2139,34 +2139,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,

   unsigned MIOpc = MI->getOpcode();
   switch (MIOpc) {
-  case X86::SHUFPSrri: {
-    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
-    if (!Subtarget.hasSSE2()) return nullptr;
-
-    unsigned B = MI->getOperand(1).getReg();
-    unsigned C = MI->getOperand(2).getReg();
-    if (B != C) return nullptr;
-    int64_t M = MI->getOperand(3).getImm();
-    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addOperand(Dest).addOperand(Src).addImm(M);
-    break;
-  }
-  case X86::SHUFPDrri: {
-    assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
-    if (!Subtarget.hasSSE2()) return nullptr;
-
-    unsigned B = MI->getOperand(1).getReg();
-    unsigned C = MI->getOperand(2).getReg();
-    if (B != C) return nullptr;
-    unsigned M = MI->getOperand(3).getImm();
-
-    // Convert to PSHUFD mask.
-    M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
-
-    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addOperand(Dest).addOperand(Src).addImm(M);
-    break;
-  }
   case X86::SHL64ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
diff --git a/test/CodeGen/X86/3addr-shufps.ll b/test/CodeGen/X86/3addr-shufps.ll
deleted file mode 100644
index 8603df9a7ab..00000000000
--- a/test/CodeGen/X86/3addr-shufps.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin13 -mcpu=pentium4 | FileCheck %s
-
-define <4 x float> @test1(<4 x i32>, <4 x float> %b) {
-  %s = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  ret <4 x float> %s
-
-; We convert shufps -> pshufd here to save a move.
-; CHECK-LABEL: test1:
-; CHECK: pshufd $-27, %xmm1, %xmm0
-; CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 434ee9b3d0c..1e34a2be10b 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -15,7 +15,8 @@ entry:

 ; OSX_SINCOS-LABEL: test1:
 ; OSX_SINCOS: callq ___sincosf_stret
-; OSX_SINCOS: pshufd {{.*}} ## xmm1 = xmm0[1,1,2,3]
+; OSX_SINCOS: movaps %xmm0, %xmm1
+; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3]
 ; OSX_SINCOS: addss %xmm0, %xmm1

 ; OSX_NOOPT: test1
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 76fcff63501..cbeb20e859a 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -291,7 +291,8 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
 ; X32-LABEL: buildvector:
 ; X32: ## BB#0: ## %entry
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; X32-NEXT: movaps %xmm0, %xmm2
+; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; X32-NEXT: addss %xmm1, %xmm0
 ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; X32-NEXT: addss %xmm2, %xmm1
@@ -300,7 +301,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
 ;
 ; X64-LABEL: buildvector:
 ; X64: ## BB#0: ## %entry
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; X64-NEXT: movaps %xmm0, %xmm2
+; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; X64-NEXT: addss %xmm1, %xmm0
 ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; X64-NEXT: addss %xmm2, %xmm1
diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index d73b9da0f2c..b9bd80f949e 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll
@@ -5,7 +5,8 @@
 define void @test1(<2 x float> %Q, float *%P2) nounwind {
 ; X64-LABEL: test1:
 ; X64: # BB#0:
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-NEXT: movaps %xmm0, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; X64-NEXT: addss %xmm0, %xmm1
 ; X64-NEXT: movss %xmm1, (%rdi)
 ; X64-NEXT: retq
@@ -13,7 +14,8 @@ define void @test1(<2 x float> %Q, float *%P2) nounwind {
 ; X32-LABEL: test1:
 ; X32: # BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X32-NEXT: movaps %xmm0, %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; X32-NEXT: addss %xmm0, %xmm1
 ; X32-NEXT: movss %xmm1, (%eax)
 ; X32-NEXT: retl
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 59041367bba..e743194ef70 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -186,7 +186,8 @@ define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: shuffle_v2f64_32:
 ; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: movapd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v2f64_32:
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 87b05202d9a..8ad9e5a11d9 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -396,14 +396,16 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 ; SSE2: # BB#0:
 ; SSE2-NEXT: andps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: andps %xmm1, %xmm0
 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test4b:
@@ -435,14 +437,16 @@ define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 ; SSE2: # BB#0:
 ; SSE2-NEXT: orps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: orps %xmm1, %xmm0
 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test5b:
@@ -1124,7 +1128,8 @@ define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test1:
@@ -1252,7 +1257,8 @@ define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test6:
@@ -1651,7 +1657,8 @@ define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test2b:
@@ -1660,7 +1667,8 @@ define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_test2b:
@@ -1668,7 +1676,8 @@ define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
 ; SSE41-NEXT: movaps %xmm1, %xmm2
 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: movaps %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: combine_test2b:
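
Illustrative sketch, not part of the patch: the trade-off the commit message describes, written out with SSE intrinsics. The function names below are invented for this example and an SSE2 target is assumed; exact instruction selection is up to the compiler. The first form can stay entirely in the floating-point domain (typically the movaps + shufps pair the updated CHECK lines expect), while the second expresses the same [1,1,2,3] lane shuffle through the integer domain as a single pshufd (immediate 0xE5, the $-27 in the deleted test) and so drags float data across the FP/integer boundary.

#include <xmmintrin.h>   // SSE:  _mm_shuffle_ps, _MM_SHUFFLE
#include <emmintrin.h>   // SSE2: _mm_shuffle_epi32 and the bit-cast intrinsics

// FP-domain form: with the 2-addr-to-3-addr conversion removed, this is
// typically lowered to a register copy (movaps) plus a destructive shufps.
static __m128 shuffle_fp_domain(__m128 v) {
  return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 1, 1)); // 0xE5 -> lanes [1,1,2,3]
}

// Integer-domain form: one pshufd and no copy, but the float value crosses
// into the integer domain and back before any FP consumer such as addss.
static __m128 shuffle_int_domain(__m128 v) {
  __m128i vi = _mm_castps_si128(v);                     // reinterpret only, no code
  vi = _mm_shuffle_epi32(vi, _MM_SHUFFLE(3, 2, 1, 1));  // pshufd $0xE5
  return _mm_castsi128_ps(vi);
}

The copy in the first form is the register-renamer-friendly movaps the commit message refers to; on many cores the second form instead pays a bypass delay of a cycle or more between the integer shuffle and its floating-point uses, which is the stall this commit avoids.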