From 9a476793c5d0a00c273b7582df09976b68e57baf Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 12 Jun 2016 15:03:25 +0000
Subject: [PATCH] [x86, SSE] change patterns for CMPP to float types to allow
 matching with SSE1 (PR28044)

This patch is intended to solve:
https://llvm.org/bugs/show_bug.cgi?id=28044

By changing the definition of X86ISD::CMPP to use float types, we allow it
to be created and pass legalization for an SSE1-only target where v4i32 is
not legal.

The motivational trail for this change includes:
https://llvm.org/bugs/show_bug.cgi?id=28001

and eventually makes this trigger:
http://reviews.llvm.org/D21190

I.e., after this step, we should be free to have Clang generate FP compare
IR instead of x86 intrinsics for SSE C packed compare intrinsics. (We can
auto-upgrade and remove the LLVM sse.cmp intrinsics as a follow-up step.)
Once we're generating vector IR instead of x86 intrinsics, a big pile of
generic optimizations can trigger.

Differential Revision: http://reviews.llvm.org/D21235

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272511 91177308-0d34-0410-b5e6-96231b3b80d8
---
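[Editor's note, not part of the original commit: a minimal sketch of how the
PR28044 improvement can be observed, assuming an llc built with this patch.
The file name and flags are illustrative; the IR mirrors the test added below
in test/CodeGen/X86/sse1.ll, with the bitcast/ret tail implied by the
function signature rather than shown in the diff context:

    ; pr28044.ll: an ordered-equal compare of two v4f32 values.
    define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
      %cmp = fcmp oeq <4 x float> %a0, %a1
      %sext = sext <4 x i1> %cmp to <4 x i32>
      %res = bitcast <4 x i32> %sext to <4 x float>
      ret <4 x float> %res
    }

    $ llc -mtriple=i386-unknown-unknown -mattr=+sse,-sse2 < pr28044.ll

Before this change, the v4i32 setcc had to be scalarized during legalization,
producing the long ucomiss/setnp/sete chains visible in the deleted CHECK
lines of the test diff; with X86ISD::CMPP typed on float vectors, the whole
compare selects to a single cmpeqps.]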
 lib/Target/X86/X86ISelLowering.cpp      | 50 +++++++++++++++++++-----
 lib/Target/X86/X86InstrFragmentsSIMD.td |  2 +-
 lib/Target/X86/X86InstrSSE.td           | 24 ++++++------
 test/CodeGen/X86/sse1.ll                | 51 +------------------------
 4 files changed, 55 insertions(+), 72 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6c0a3af2351..30c3d531fb5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15168,32 +15168,57 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
 #endif
 
-    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
-    unsigned Opc = X86ISD::CMPP;
+    unsigned Opc;
     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
       assert(VT.getVectorNumElements() <= 16);
       Opc = X86ISD::CMPM;
-    }
-    // In the two special cases we can't handle, emit two comparisons.
+    } else {
+      Opc = X86ISD::CMPP;
+      // The SSE/AVX packed FP comparison nodes are defined with a
+      // floating-point vector result that matches the operand type. This allows
+      // them to work with an SSE1 target (integer vector types are not legal).
+      VT = Op0.getSimpleValueType();
+    }
+
+    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+    // emit two comparisons and a logic op to tie them together.
+    // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
+    // available.
+    SDValue Cmp;
+    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
     if (SSECC == 8) {
+      // LLVM predicate is SETUEQ or SETONE.
       unsigned CC0, CC1;
       unsigned CombineOpc;
       if (SetCCOpcode == ISD::SETUEQ) {
-        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+        CC0 = 3; // UNORD
+        CC1 = 0; // EQ
+        CombineOpc = Opc == X86ISD::CMPP ? X86ISD::FOR : ISD::OR;
       } else {
         assert(SetCCOpcode == ISD::SETONE);
-        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+        CC0 = 7; // ORD
+        CC1 = 4; // NEQ
+        CombineOpc = Opc == X86ISD::CMPP ? X86ISD::FAND : ISD::AND;
       }
 
       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC0, dl, MVT::i8));
       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC1, dl, MVT::i8));
-      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+    } else {
+      // Handle all other FP comparisons here.
+      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                        DAG.getConstant(SSECC, dl, MVT::i8));
     }
 
-    // Handle all other FP comparisons here.
-    return DAG.getNode(Opc, dl, VT, Op0, Op1,
-                       DAG.getConstant(SSECC, dl, MVT::i8));
+
+    // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
+    // result type of SETCC. The bitcast is expected to be optimized away
+    // during combining/isel.
+    if (Opc == X86ISD::CMPP)
+      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+    return Cmp;
   }
 
   MVT VTOp0 = Op0.getSimpleValueType();
@@ -29647,6 +29672,11 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
+  // via legalization because v4i32 is not a legal type.
+  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
+    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+
   return SDValue();
 }
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 958bb822a06..e2155972cc5 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -35,7 +35,7 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
 // SSE specific DAG Nodes.
 //===----------------------------------------------------------------------===//
 
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
                                        SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>;
 
 def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d588fba1dad..661f733a1b9 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2498,36 +2498,36 @@ let Constraints = "$src1 = $dst" in {
 }
 
 let Predicates = [HasAVX] in {
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
           (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
           (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
 
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
           (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
 
-def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
           (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
 
-def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
 }
 
 let Predicates = [UseSSE1] in {
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
 }
 
 let Predicates = [UseSSE2] in {
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
 }
 
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 6ed3c1c48d6..29c041ba7f6 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -53,55 +53,8 @@
 define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; CHECK-LABEL: PR28044:
 ; CHECK:       # BB#0:
-; CHECK:         movaps %xmm1, %xmm2
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; CHECK-NEXT:    movaps %xmm0, %xmm3
-; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; CHECK-NEXT:    ucomiss %xmm2, %xmm3
-; CHECK-NEXT:    setnp %al
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    andb %al, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    sarl $31, %eax
-; CHECK-NEXT:    movl %eax,
-; CHECK-NEXT:    movaps %xmm1, %xmm2
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; CHECK-NEXT:    movaps %xmm0, %xmm3
-; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; CHECK-NEXT:    ucomiss %xmm2, %xmm3
-; CHECK-NEXT:    setnp %al
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    andb %al, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    sarl $31, %eax
-; CHECK-NEXT:    movl %eax,
-; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    setnp %al
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    andb %al, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    sarl $31, %eax
-; CHECK-NEXT:    movl %eax,
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; CHECK-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-NEXT:    setnp %al
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    andb %al, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    sarl $31, %eax
-; CHECK-NEXT:    movl %eax,
-; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    cmpeqps %xmm1, %xmm0
+; CHECK-NEXT:    ret
 ;
   %cmp = fcmp oeq <4 x float> %a0, %a1
   %sext = sext <4 x i1> %cmp to <4 x i32>
-- 
2.50.1
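
[Editor's note, appended for illustration and not part of the commit: the
SSECC == 8 path in LowerVSETCC above can be exercised with an unordered-equal
compare. A sketch with a hypothetical function name, modeled on the PR28044
test:

    define <4 x float> @cmp_ueq(<4 x float> %a0, <4 x float> %a1) nounwind {
      %cmp = fcmp ueq <4 x float> %a0, %a1
      %sext = sext <4 x i1> %cmp to <4 x i32>
      %res = bitcast <4 x i32> %sext to <4 x float>
      ret <4 x float> %res
    }

Per the lowering code (CC0 = 3 UNORD, CC1 = 0 EQ, CombineOpc = X86ISD::FOR),
this should become two CMPPS instructions joined by an OR on an SSE1-only
target; the exact schedule and registers will vary, but roughly:

    movaps     %xmm0, %xmm2
    cmpunordps %xmm1, %xmm2
    cmpeqps    %xmm1, %xmm0
    orps       %xmm2, %xmm0
]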