Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+ def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+ def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
+ def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
}
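// For reference, a minimal IR-level sketch of how the re-added truncating
// conversions are invoked (signatures taken from the defs above; the SSE2
// 128-bit form is shown alongside the two 256-bit AVX forms):
//   %a = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %x)
//   %b = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %y)
//   %c = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %z)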
// Vector bit test
/// Attempt to fold an SSE floating point to integer conversion of a constant
/// floating point. If roundTowardZero is false, the default IEEE rounding is
/// used (round to nearest, ties to even), matching the non-truncating SSE
/// instructions in the default rounding mode; otherwise the value is
/// truncated toward zero, as the SSE cvtt* instructions do. The desired
/// integer type Ty is used to select how many bits are available for the
/// result. Returns null if the conversion cannot be performed, otherwise
/// returns the Constant value resulting from the conversion.
-Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
- Type *Ty) {
+Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
+ Type *Ty) {
// All of these conversion intrinsics form an integer of at most 64 bits.
unsigned ResultWidth = Ty->getIntegerBitWidth();
assert(ResultWidth <= 64 &&
       "Can only constant fold conversions to 64 and 32 bit ints");

uint64_t UIntVal;
bool isExact = false;
APFloat::roundingMode mode = roundTowardZero ? APFloat::rmTowardZero
                                             : APFloat::rmNearestTiesToEven;
APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
/*isSigned=*/true, mode,
&isExact);
- if (status != APFloat::opOK && status != APFloat::opInexact)
+ if (status != APFloat::opOK &&
+ (!roundTowardZero || status != APFloat::opInexact))
return nullptr;
return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
}
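// A worked example of the intended folding behaviour (a sketch; values are
// illustrative): for Val = 1.75 and a 32-bit Ty, convertToInteger reports
// opInexact, so cvtsd2si (roundTowardZero == false) is now left unfolded,
// while cvttsd2si (roundTowardZero == true) still folds to 1, truncation
// being independent of the runtime rounding mode.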
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2si64:
if (ConstantFP *FPOp =
- dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
- return ConstantFoldConvertToInt(FPOp->getValueAPF(),
- /*roundTowardZero=*/false, Ty);
+ dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+ return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+ /*roundTowardZero=*/false, Ty);
break;
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
if (ConstantFP *FPOp =
- dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
- return ConstantFoldConvertToInt(FPOp->getValueAPF(),
- /*roundTowardZero=*/true, Ty);
+ dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+ return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+ /*roundTowardZero=*/true, Ty);
}
}
Name == "sse2.cvtps2pd" ||
Name == "avx.cvtdq2.pd.256" ||
Name == "avx.cvt.ps2.pd.256" ||
- Name == "sse2.cvttps2dq" ||
- Name.startswith("avx.cvtt.") ||
Name.startswith("avx.vinsertf128.") ||
Name == "avx2.vinserti128" ||
Name.startswith("avx.vextractf128.") ||
Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
else
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
- } else if (IsX86 && (Name == "sse2.cvttps2dq" ||
- Name.startswith("avx.cvtt."))) {
- // Truncation (round to zero) float/double to i32 vector conversion.
- Value *Src = CI->getArgOperand(0);
- VectorType *DstTy = cast<VectorType>(CI->getType());
- Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
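// The FPToSI-based upgrade above is dropped because generic fptosi is
// undefined for out-of-range inputs, whereas the x86 cvtt* instructions
// return the "integer indefinite" value 0x80000000; keeping the calls in
// target-intrinsic form preserves that defined behaviour.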
} else if (IsX86 && Name.startswith("sse4a.movnt.")) {
Module *M = F->getParent();
SmallVector<Metadata *, 1> Elts;
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
Sched<[WriteCvtF2F]>;
-def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2F]>;
-def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (loadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [(set VR256:$dst,
+ (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+ (loadv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
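// Note the asymmetry in the memory patterns above: the VEX-encoded forms
// fold any load (loadv4f32/loadv8f32) since AVX memory operands need not be
// aligned, while the legacy SSE form uses memopv4f32, which only matches
// loads satisfying the 16-byte alignment the non-VEX encoding requires.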
let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
- %cvt = fptosi <4 x double> %a0 to <4 x i32>
+ %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
%res = bitcast <4 x i32> %cvt to <2 x i64>
ret <2 x i64> %res
}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvttps_epi32:
; X64: # BB#0:
; X64-NEXT: vcvttps2dq %ymm0, %ymm0
; X64-NEXT: retq
- %cvt = fptosi <8 x float> %a0 to <8 x i32>
+ %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
%res = bitcast <8 x i32> %cvt to <4 x i64>
ret <4 x i64> %res
}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_div_pd:
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
-define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
-
-
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; add operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
+define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; AVX512VL-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512VL-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
+
+
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; AVX-LABEL: test_x86_avx_dp_ps_256:
; AVX: ## BB#0:
; AVX-LABEL: movnt_dq:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %ymm0, (%eax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
; AVX512VL-LABEL: movnt_dq:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_ss:
; X64: # BB#0:
-; X64-NEXT: cvtsi2ssq %rdi, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: cvtsi2ssq %rdi, %xmm0
; X64-NEXT: retq
- %cvt = sitofp i64 %a1 to float
- %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
ret <4 x float> %res
}
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
; X64-LABEL: test_mm_cvttss_si64:
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: retq
- %cvt = extractelement <4 x float> %a0, i32 0
- %res = fptosi float %cvt to i64
+ %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
ret i64 %res
}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: cvtsi2ssl %eax, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
; X64: # BB#0:
-; X64-NEXT: cvtsi2ssl %edi, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: cvtsi2ssl %edi, %xmm0
; X64-NEXT: retq
- %cvt = sitofp i32 %a1 to float
- %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
ret <4 x float> %res
}
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define i32 @test_mm_cvtt_ss2si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtt_ss2si:
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
- %cvt = extractelement <4 x float> %a0, i32 0
- %res = fptosi float %cvt to i32
+ %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
ret i32 %res
}
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
- %cvt = extractelement <4 x float> %a0, i32 0
- %res = fptosi float %cvt to i32
+ %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
ret i32 %res
}
define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_sd:
; X64: # BB#0:
-; X64-NEXT: cvtsi2sdq %rdi, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: cvtsi2sdq %rdi, %xmm0
; X64-NEXT: retq
- %cvt = sitofp i64 %a1 to double
- %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
ret <2 x double> %res
}
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
; X64-LABEL: test_mm_cvttsd_si64:
; X64: # BB#0:
; X64-NEXT: cvttsd2si %xmm0, %rax
; X64-NEXT: retq
- %ext = extractelement <2 x double> %a0, i32 0
- %res = fptosi double %ext to i64
+ %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
ret i64 %res
}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
; X64-LABEL: test_mm_loadu_si64:
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_cvtsd_ss:
+; X32: # BB#0:
+; X32-NEXT: cvtsd2ss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_ss:
+; X64: # BB#0:
+; X64-NEXT: cvtsd2ss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) {
+; X32-LABEL: test_mm_cvtsd_ss_load:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm1
+; X32-NEXT: cvtsd2ss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_ss_load:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm1
+; X64-NEXT: cvtsd2ss %xmm1, %xmm0
+; X64-NEXT: retq
+ %a1 = load <2 x double>, <2 x double>* %p1
+ %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+ ret <4 x float> %res
+}
+
define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttps_epi32:
; X32: # BB#0:
; X64: # BB#0:
; X64-NEXT: cvttps2dq %xmm0, %xmm0
; X64-NEXT: retq
- %res = fptosi <4 x float> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
%bc = bitcast <4 x i32> %res to <2 x i64>
ret <2 x i64> %bc
}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvttsd_si32:
; X64: # BB#0:
; X64-NEXT: cvttsd2si %xmm0, %eax
; X64-NEXT: retq
- %ext = extractelement <2 x double> %a0, i32 0
- %res = fptosi double %ext to i32
+ %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
ret i32 %res
}
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_div_pd:
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttps2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_sse2_storel_dq:
; CHECK: ## BB#0:
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: paddb LCPI8_0, %xmm0
+; CHECK-NEXT: paddb LCPI7_0, %xmm0
; CHECK-NEXT: movdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %p1) {
+; SSE-LABEL: test_x86_sse2_cvtsd2ss_load:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movaps (%eax), %xmm1
+; SSE-NEXT: cvtsd2ss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsd2ss_load:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vcvtsd2ss (%eax), %xmm0, %xmm0
+; KNL-NEXT: retl
+ %a1 = load <2 x double>, <2 x double>* %p1
+ %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+
define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
; SSE-LABEL: test_x86_sse2_cvtsi2sd:
; SSE: ## BB#0:
declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
+define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, <4 x float>* %p1) {
+; SSE-LABEL: test_x86_sse2_cvtss2sd_load:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movaps (%eax), %xmm1
+; SSE-NEXT: cvtss2sd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtss2sd_load:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vcvtss2sd (%eax), %xmm0, %xmm0
+; KNL-NEXT: retl
+ %a1 = load <4 x float>, <4 x float>* %p1
+ %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+
+
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvttpd2dq:
; SSE: ## BB#0:
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+; SSE-LABEL: test_x86_sse2_cvttps2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvttps2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttps2dq %xmm0, %xmm0
+; KNL-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvttsd2si:
; SSE: ## BB#0:
ret i1 %b
}
-; TODO: Inexact values should not fold as they are dependent on rounding mode
+; Inexact values should not fold as they are dependent on rounding mode
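+; For example, 1.75 converts inexactly: cvtss2si yields 2 under the default
+; round-to-nearest-even mode but 1 under round-toward-zero, so the result
+; depends on the runtime MXCSR rounding mode and must be left unfolded.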
define i1 @test_sse_cvts_inexact() nounwind readnone {
; CHECK-LABEL: @test_sse_cvts_inexact(
-; CHECK-NOT: call
-; CHECK: ret i1 true
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
entry:
%i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
%i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind