From: Craig Topper <craig.topper@gmail.com>
Date: Wed, 9 Nov 2016 07:31:32 +0000 (+0000)
Subject: [X86] Lower AVX512 and SSE intrinsics for CVTTPD2DQ to X86ISD::CVTTPD2DQ.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=45b3e8f9df5d37f9c539c103f2ce6cc552c0173d;p=llvm

[X86] Lower AVX512 and SSE intrinsics for CVTTPD2DQ to X86ISD::CVTTPD2DQ.

Summary: This allows the SSE intrinsic to use the EVEX instruction when available. It also fixes EVEX to not use a weird (v4i32 (fp_to_sint v2f64)) node and it merges some isel patterns. This also fixes some cases that weren't combining vzmovl with cvttpd2dq to remove extra moves.

Reviewers: delena, zvi, RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D26330

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286344 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7fafa207c4c..408393183cb 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -6077,6 +6077,10 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
             (VCVTPS2PDZrm addr:$src)>;
 
 let Predicates = [HasVLX] in {
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+            (VCVTPD2PSZ128rr VR128X:$src)>;
   def : Pat<(v2f64 (extloadv2f32 addr:$src)),
               (VCVTPS2PDZ128rm addr:$src)>;
   def : Pat<(v4f64 (extloadv4f32 addr:$src)),
@@ -6148,8 +6152,8 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
 }
 
 // Convert Double to Signed/Unsigned Doubleword with truncation
-multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
-                                  SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            SDNode OpNode128, SDNode OpNodeRnd> {
   let Predicates = [HasAVX512] in {
     defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
              avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
@@ -6157,11 +6161,11 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
   }
   let Predicates = [HasVLX] in {
     // we need "x"/"y" suffixes in order to distinguish between 128 and 256
-    // memory forms of these instructions in Asm Parcer. They have the same
+    // memory forms of these instructions in Asm Parser. They have the same
     // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
     // due to the same reason.
-    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
-                               "{1to2}", "{x}">, EVEX_V128;
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
+                               OpNode128, "{1to2}", "{x}">, EVEX_V128;
     defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
                                "{1to4}", "{y}">, EVEX_V256;
   }
@@ -6302,7 +6306,7 @@ defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
                                 X86cvttp2siRnd>,
                                 XS, EVEX_CD8<32, CD8VF>;
 
-defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint,
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttpd2dq,
                                  X86cvttp2siRnd>,
                                  PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
@@ -6310,7 +6314,7 @@ defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
                                  X86cvttp2uiRnd>, PS,
                                  EVEX_CD8<32, CD8VF>;
 
-defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, fp_to_uint,
                                  X86cvttp2uiRnd>, PS, VEX_W,
                                  EVEX_CD8<64, CD8VF>;
 
@@ -6408,13 +6412,10 @@ def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
 }
 
 let Predicates = [HasAVX512, HasVLX] in {
-  def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))))),
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))),
             (VCVTTPD2DQZ128rr VR128:$src)>;
-  def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src))),
-            (VCVTTPD2DQZ128rr VR128X:$src)>;
-  def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))),
-            (VCVTTPD2DQZ128rm addr:$src)>;
 }
 
 let Predicates = [HasAVX512] in {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8e6a6350694..8de5a61beb2 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2065,11 +2065,12 @@ let Predicates = [UseSSE2] in {
             (CVTTPS2DQrm addr:$src)>;
 }
 
+let Predicates = [HasAVX, NoVLX] in
 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                              (int_x86_sse2_cvttpd2dq VR128:$src))],
-                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+                          (v4i32 (X86cvttpd2dq (v2f64 VR128:$src))))],
+                        IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
 
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
 // register, but the same isn't true when using memory operands instead.
@@ -2078,10 +2079,11 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 // XMM only
 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+let Predicates = [HasAVX, NoVLX] in
 def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttpd2dqx\t{$src, $dst|$dst, $src}",
-                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
-                                            (loadv2f64 addr:$src)))],
+                         [(set VR128:$dst,
+                           (v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))))],
                          IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 
 // YMM only
@@ -2099,13 +2101,10 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
 
 let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))),
-            (VCVTTPD2DQrr VR128:$src)>;
-  def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))),
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))),
             (VCVTTPD2DQrr VR128:$src)>;
-  def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))),
-            (VCVTTPD2DQXrm addr:$src)>;
 
   def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
             (VCVTTPD2DQYrr VR256:$src)>;
@@ -2125,8 +2124,9 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                       Sched<[WriteCvtF2ILd]>;
 
 let Predicates = [UseSSE2] in {
-  def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
-                               (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))),
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2i64 (bitconvert
+                               (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))),
             (CVTTPD2DQrr VR128:$src)>;
   def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))),
             (CVTTPD2DQrr VR128:$src)>;
@@ -2254,8 +2254,9 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 
 let Predicates = [HasAVX, NoVLX] in {
   // Match fpround and fpextend for 128/256-bit conversions
-  def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert
-                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))))),
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
             (VCVTPD2PSrr VR128:$src)>;
   def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
             (VCVTPD2PSrr VR128:$src)>;
@@ -2272,8 +2273,9 @@ let Predicates = [HasAVX, NoVLX] in {
 
 let Predicates = [UseSSE2] in {
   // Match fpround and fpextend for 128 conversions
-  def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert
-                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))))),
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
             (CVTPD2PSrr VR128:$src)>;
   def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
             (CVTPD2PSrr VR128:$src)>;
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index ca529038d52..f83878b10c8 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -574,7 +574,7 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::VFPEXTS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
-                     ISD::FP_TO_SINT, 0),
+                     X86ISD::CVTTPD2DQ, 0),
   X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
                      ISD::FP_TO_SINT, 0),
   X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
@@ -1636,6 +1636,7 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_comineq_sd,   COMI, X86ISD::COMI, ISD::SETNE),
   X86_INTRINSIC_DATA(sse2_cvtdq2ps,     INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
   X86_INTRINSIC_DATA(sse2_cvtpd2ps,     INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+  X86_INTRINSIC_DATA(sse2_cvttpd2dq,    INTR_TYPE_1OP, X86ISD::CVTTPD2DQ, 0),
   X86_INTRINSIC_DATA(sse2_max_pd,       INTR_TYPE_2OP, X86ISD::FMAX, 0),
   X86_INTRINSIC_DATA(sse2_min_pd,       INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(sse2_movmsk_pd,    INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index da1df98e45b..16df906635d 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -338,10 +338,15 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
 
 
 define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttpd2dq:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_sse2_cvttpd2dq:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
+; AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
+; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 79453011f9f..2e5f8a50843 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -324,8 +324,6 @@ define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind {
 ; SKX-LABEL: test_x86_sse2_cvtpd2ps_zext:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcvtpd2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x5a,0xc0]
-; SKX-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0]
-; SKX-NEXT:    ## xmm0 = xmm0[0],zero
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %cvt = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
   %res = shufflevector <4 x float> %cvt, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -502,10 +500,15 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; VCHECK-LABEL: test_x86_sse2_cvttpd2dq:
-; VCHECK:       ## BB#0:
-; VCHECK-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
-; VCHECK-NEXT:    retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_cvttpd2dq:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_cvttpd2dq:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
+; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -516,22 +519,16 @@ define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind {
 ; SSE-LABEL: test_mm_cvttpd_epi32_zext:
 ; SSE:       ## BB#0:
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
-; SSE-NEXT:    movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
-; SSE-NEXT:    ## xmm0 = xmm0[0],zero
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX2-LABEL: test_mm_cvttpd_epi32_zext:
 ; AVX2:       ## BB#0:
 ; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
-; AVX2-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
-; AVX2-NEXT:    ## xmm0 = xmm0[0],zero
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test_mm_cvttpd_epi32_zext:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
-; SKX-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0]
-; SKX-NEXT:    ## xmm0 = xmm0[0],zero
+; SKX-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %cvt = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
   %res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>