From: Craig Topper Date: Wed, 9 Nov 2016 07:31:32 +0000 (+0000) Subject: [X86] Lower AVX512 and SSE intrinsics for CVTTPD2DQ to X86ISD::CVTTPD2DQ. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=45b3e8f9df5d37f9c539c103f2ce6cc552c0173d;p=llvm [X86] Lower AVX512 and SSE intrinsics for CVTTPD2DQ to X86ISD::CVTTPD2DQ. Summary: This allows the SSE intrinsic to use the EVEX instruction when available. It also fixes EVEX to not use a weird (v4i32 (fp_to_sint v2f64)) node and it merges some isel patterns. This also fixes some cases that weren't combining vzmovl with cvttpd2dq to remove extra moves. Reviewers: delena, zvi, RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D26330 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286344 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 7fafa207c4c..408393183cb 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -6077,6 +6077,10 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; let Predicates = [HasVLX] in { + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), + (VCVTPD2PSZ128rr VR128X:$src)>; def : Pat<(v2f64 (extloadv2f32 addr:$src)), (VCVTPS2PDZ128rm addr:$src)>; def : Pat<(v4f64 (extloadv4f32 addr:$src)), @@ -6148,8 +6152,8 @@ multiclass avx512_cvtps2dq opc, string OpcodeStr, } // Convert Double to Signed/Unsigned Doubleword with truncation -multiclass avx512_cvttpd2dq opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvttpd2dq opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128, SDNode OpNodeRnd> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp, avx512_vcvt_fp_sae opc, string OpcodeStr, } let Predicates = [HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 - // memory forms of these instructions in Asm Parcer. They have the same + // memory forms of these instructions in Asm Parser. They have the same // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. - defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z128 : avx512_vcvt_fp, EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; } @@ -6302,7 +6306,7 @@ defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint, X86cvttp2siRnd>, XS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttpd2dq, X86cvttp2siRnd>, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -6310,7 +6314,7 @@ defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint, X86cvttp2uiRnd>, PS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, fp_to_uint, X86cvttp2uiRnd>, PS, VEX_W, EVEX_CD8<64, CD8VF>; @@ -6408,13 +6412,10 @@ def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), } let Predicates = [HasAVX512, HasVLX] in { - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))), (VCVTTPD2DQZ128rr VR128:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src))), - (VCVTTPD2DQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))), - (VCVTTPD2DQZ128rm addr:$src)>; } let Predicates = [HasAVX512] in { diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8e6a6350694..8de5a61beb2 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2065,11 +2065,12 @@ let Predicates = [UseSSE2] in { (CVTTPS2DQrm addr:$src)>; } +let Predicates = [HasAVX, NoVLX] in def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src))))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. @@ -2078,10 +2079,11 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; +let Predicates = [HasAVX, NoVLX] in def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dqx\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (loadv2f64 addr:$src)))], + [(set VR128:$dst, + (v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))))], IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; // YMM only @@ -2099,13 +2101,10 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))), - (VCVTTPD2DQrr VR128:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), (VCVTTPD2DQrr VR128:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))), - (VCVTTPD2DQXrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; @@ -2125,8 +2124,9 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), Sched<[WriteCvtF2ILd]>; let Predicates = [UseSSE2] in { - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), (CVTTPD2DQrr VR128:$src)>; def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))), (CVTTPD2DQrr VR128:$src)>; @@ -2254,8 +2254,9 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), let Predicates = [HasAVX, NoVLX] in { // Match fpround and fpextend for 128/256-bit conversions - def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), (VCVTPD2PSrr VR128:$src)>; def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (VCVTPD2PSrr VR128:$src)>; @@ -2272,8 +2273,9 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [UseSSE2] in { // Match fpround and fpextend for 128 conversions - def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), (CVTPD2PSrr VR128:$src)>; def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (CVTPD2PSrr VR128:$src)>; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index ca529038d52..f83878b10c8 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -574,7 +574,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VFPEXTS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, 0), + X86ISD::CVTTPD2DQ, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, @@ -1636,6 +1636,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0), X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTPD2DQ, 0), X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index da1df98e45b..16df906635d 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -338,10 +338,15 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) { -; CHECK-LABEL: test_x86_sse2_cvttpd2dq: -; CHECK: ## BB#0: -; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse2_cvttpd2dq: +; AVX: ## BB#0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0] +; AVX512VL-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index 79453011f9f..2e5f8a50843 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -324,8 +324,6 @@ define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind { ; SKX-LABEL: test_x86_sse2_cvtpd2ps_zext: ; SKX: ## BB#0: ; SKX-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x5a,0xc0] -; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0] -; SKX-NEXT: ## xmm0 = xmm0[0],zero ; SKX-NEXT: retl ## encoding: [0xc3] %cvt = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) %res = shufflevector <4 x float> %cvt, <4 x float> zeroinitializer, <4 x i32> @@ -502,10 +500,15 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) { ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse2_cvttpd2dq: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse2_cvttpd2dq: +; AVX2: ## BB#0: +; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_cvttpd2dq: +; SKX: ## BB#0: +; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -516,22 +519,16 @@ define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind { ; SSE-LABEL: test_mm_cvttpd_epi32_zext: ; SSE: ## BB#0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0] -; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0] -; SSE-NEXT: ## xmm0 = xmm0[0],zero ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_mm_cvttpd_epi32_zext: ; AVX2: ## BB#0: ; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; AVX2-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0] -; AVX2-NEXT: ## xmm0 = xmm0[0],zero ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_mm_cvttpd_epi32_zext: ; SKX: ## BB#0: -; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0] -; SKX-NEXT: ## xmm0 = xmm0[0],zero +; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0] ; SKX-NEXT: retl ## encoding: [0xc3] %cvt = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) %res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32>