assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
- unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
- unsigned Opc = X86ISD::CMPP;
+ unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
Opc = X86ISD::CMPM;
- }
- // In the two special cases we can't handle, emit two comparisons.
+ } else {
+ Opc = X86ISD::CMPP;
+ // The SSE/AVX packed FP comparison nodes are defined with a
+ // floating-point vector result that matches the operand type. This allows
+ // them to work with an SSE1 target (integer vector types are not legal).
+ VT = Op0.getSimpleValueType();
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
+ // available.
+ SDValue Cmp;
+ unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
if (SSECC == 8) {
+ // LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (SetCCOpcode == ISD::SETUEQ) {
- CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = Opc == X86ISD::CMPP ? X86ISD::FOR : ISD::OR;
} else {
assert(SetCCOpcode == ISD::SETONE);
- CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = Opc == X86ISD::CMPP ? X86ISD::FAND : ISD::AND;
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC1, dl, MVT::i8));
- return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ } else {
+ // Handle all other FP comparisons here.
+ Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, dl, MVT::i8));
}
- // Handle all other FP comparisons here.
- return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, dl, MVT::i8));
+
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
+ // result type of SETCC. The bitcast is expected to be optimized away
+ // during combining/isel.
+ if (Opc == X86ISD::CMPP)
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+ return Cmp;
}
MVT VTOp0 = Op0.getSimpleValueType();
}
}
+ // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
+ // via legalization because v4i32 is not a legal type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
+ return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+
return SDValue();
}
// SSE specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
SDTCisFP<1>, SDTCisVT<3, i8>,
SDTCisVec<1>]>;
def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
}
let Predicates = [HasAVX] in {
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
(VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
(VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
(VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
(VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
(VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
let Predicates = [UseSSE1] in {
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}
let Predicates = [UseSSE2] in {
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
(CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: PR28044:
; CHECK: # BB#0:
-; CHECK: movaps %xmm1, %xmm2
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; CHECK-NEXT: movaps %xmm0, %xmm3
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; CHECK-NEXT: ucomiss %xmm2, %xmm3
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: andb %al, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: shll $31, %eax
-; CHECK-NEXT: sarl $31, %eax
-; CHECK-NEXT: movl %eax,
-; CHECK-NEXT: movaps %xmm1, %xmm2
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; CHECK-NEXT: movaps %xmm0, %xmm3
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; CHECK-NEXT: ucomiss %xmm2, %xmm3
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: andb %al, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: shll $31, %eax
-; CHECK-NEXT: sarl $31, %eax
-; CHECK-NEXT: movl %eax,
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: andb %al, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: shll $31, %eax
-; CHECK-NEXT: sarl $31, %eax
-; CHECK-NEXT: movl %eax,
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: andb %al, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: shll $31, %eax
-; CHECK-NEXT: sarl $31, %eax
-; CHECK-NEXT: movl %eax,
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: ret
;
%cmp = fcmp oeq <4 x float> %a0, %a1
%sext = sext <4 x i1> %cmp to <4 x i32>