DAG.getIntPtrConstant(0, dl));
}
+/// Return true if the cast \p Opcode from vector type \p FromVT to vector type
+/// \p ToVT should be performed as a vector operation on \p Subtarget.
+static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
+                          const X86Subtarget &Subtarget) {
+  // Every cast accepted below takes a v4i32 source, so reject anything else
+  // before looking at the opcode.
+  if (FromVT != MVT::v4i32)
+    return false;
+
+  const bool ToV4F32 = ToVT == MVT::v4f32;
+  const bool ToV4F64 = ToVT == MVT::v4f64;
+
+  if (Opcode == ISD::SINT_TO_FP) {
+    // TODO: Handle wider types with AVX/AVX512.
+    if (!Subtarget.hasSSE2())
+      return false;
+    // CVTDQ2PS, or (V)CVTDQ2PD when AVX is available.
+    return ToV4F32 || (ToV4F64 && Subtarget.hasAVX());
+  }
+
+  if (Opcode == ISD::UINT_TO_FP) {
+    // TODO: Handle wider types and i64 elements.
+    // VCVTUDQ2PS or VCVTUDQ2PD
+    return Subtarget.hasAVX512() && (ToV4F32 || ToV4F64);
+  }
+
+  // No other cast opcode is handled.
+  return false;
+}
+
+/// If \p Cast is a scalar cast whose operand is element 0 extracted from a
+/// vector, try to rewrite it as a vector cast followed by an extract of
+/// element 0. Doing the cast in the vector domain avoids an expensive
+/// round-trip between XMM and GPR. Returns an empty SDValue when the pattern
+/// does not match or useVectorCast() rejects the types.
+static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  // TODO: This could be enhanced to handle smaller integer types by peeking
+  // through an extend.
+  SDValue Src = Cast.getOperand(0);
+  MVT ScalarVT = Cast.getSimpleValueType();
+  if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  // TODO: The limitation for extracting from the 0-element is not required,
+  // but if we extract from some other element, it will require shuffling to
+  // get the result into the right place.
+  SDValue EltIdx = Src.getOperand(1);
+  if (!isNullConstant(EltIdx))
+    return SDValue();
+
+  // Build the result vector type: same element count as the source vector,
+  // with the scalar cast's result type as the element type.
+  SDValue SrcVec = Src.getOperand(0);
+  MVT SrcVecVT = SrcVec.getSimpleValueType();
+  MVT DstVecVT = MVT::getVectorVT(ScalarVT, SrcVecVT.getVectorNumElements());
+  unsigned CastOpc = Cast.getOpcode();
+  if (!useVectorCast(CastOpc, SrcVecVT, DstVecVT, Subtarget))
+    return SDValue();
+
+  // cast (extract V, Y) --> extract (cast V), Y
+  SDLoc DL(Cast);
+  SDValue VecCast = DAG.getNode(CastOpc, DL, DstVecVT, SrcVec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, VecCast, EltIdx);
+}
+
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsllq $20, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT: vpsrlq $3, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: vpand %xmm1, %xmm0, %xmm2
; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
; SSE-LABEL: extract0_sitofp_v4i32_f32:
; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract0_sitofp_v4i32_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = sitofp i32 %e to float
; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: incl %eax
; SSE-NEXT: cvtsi2ssl %eax, %xmm1
; SSE-NEXT: divss %xmm1, %xmm0
; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: incl %eax
; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm1
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind {
; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: cvtsi2ssl %eax, %xmm1
-; SSE-NEXT: movd %xmm0, (%rdi)
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE-NEXT: movss %xmm0, (%rdi)
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm1
-; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1
+; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
;
; AVX-LABEL: extract0_sitofp_v4i32_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2sdl %eax, %xmm1, %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = sitofp i32 %e to double
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0
; VEX-NEXT: retq
;
-; AVX512-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vcvtusi2ssl %eax, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = uitofp i32 %e to float
ret float %r
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm0
; VEX-NEXT: retq
;
-; AVX512-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vcvtusi2sdl %eax, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
+; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = uitofp i32 %e to double
ret double %r
; SSE2-LABEL: extract3_sitofp_v4i32_f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract3_sitofp_v4i32_f32: