[X86] Add DAG combine to turn (vzmovl (insert_subvector undef, X, 0)) into (insert_subvector allzeros, (vzmovl X), 0)

author Craig Topper <craig.topper@intel.com>

Fri, 21 Jun 2019 19:10:21 +0000 (19:10 +0000)

committer Craig Topper <craig.topper@intel.com>

Fri, 21 Jun 2019 19:10:21 +0000 (19:10 +0000)
author Craig Topper <craig.topper@intel.com>
Fri, 21 Jun 2019 19:10:21 +0000 (19:10 +0000)
committer Craig Topper <craig.topper@intel.com>
Fri, 21 Jun 2019 19:10:21 +0000 (19:10 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index ff0f41eaf3c3b4b867a86a0032c548690d71eca3..b21f8fa25cdc84773f0f88024b481049017958fe 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -33658,6 +33658,22 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
      }
    }
  
+  // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+  // insert into a zero vector. This helps get VZEXT_MOVL closer to
+  // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+  // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+  if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
+      N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
+      N->getOperand(0).hasOneUse() &&
+      N->getOperand(0).getOperand(0).isUndef() &&
+      isNullConstant(N->getOperand(0).getOperand(2))) {
+    SDValue In = N->getOperand(0).getOperand(1);
+    SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+                       getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+                       Movl, N->getOperand(0).getOperand(2));
+  }
+
    // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
    // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
    // FIXME: This can probably go away once we default to widening legalization.
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td

index 8315b867316a95d501091ceffdb4c5f23b2c2964..b7172050b363dd368729e0bbfc266bf79a3f15a2 100644 (file)
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -4329,39 +4329,17 @@ let Predicates = [HasAVX512] in {
  
    // Represent the same patterns above but in the form they appear for
    // 256-bit types
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
    def : Pat<(v8f32 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
    def : Pat<(v4f64 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
  
    // Represent the same patterns above but in the form they appear for
    // 512-bit types
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
    def : Pat<(v16f32 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
    def : Pat<(v8f64 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
  }
  
  let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
@@ -4380,14 +4358,6 @@ let Predicates = [HasAVX512] in {
    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (VMOV64toPQIZrr GR64:$src)>;
  
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
-                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
    // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
              (VMOVDI2PDIZrm addr:$src)>;
@@ -4408,14 +4378,6 @@ let Predicates = [HasAVX512] in {
    def : Pat<(v4i64 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
  
-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                                (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-
    // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
    def : Pat<(v16i32 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index e25d2dca4047ed040e0473b6f5afb91b5ed67261..810beb6f40b97946751250f21b7ce671ed863d76 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -283,14 +283,8 @@ let Predicates = [UseAVX] in {
  
    // Represent the same patterns above but in the form they appear for
    // 256-bit types
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
    def : Pat<(v8f32 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
    def : Pat<(v4f64 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }
@@ -4145,9 +4139,6 @@ let Predicates = [UseAVX] in {
    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (VMOV64toPQIrr GR64:$src)>;
  
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-              (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
    // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
    // These instructions also write zeros in the high part of a 256-bit register.
    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
@@ -4158,15 +4149,8 @@ let Predicates = [UseAVX] in {
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzload addr:$src)),
              (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-              (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
    def : Pat<(v8i32 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
  }
  
  let Predicates = [UseSSE2] in {
@@ -4253,9 +4237,6 @@ let Predicates = [UseAVX] in {
              (VMOVQI2PQIrm addr:$src)>;
    def : Pat<(v2i64 (X86vzload addr:$src)),
              (VMOVQI2PQIrm addr:$src)>;
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
    def : Pat<(v4i64 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
  
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll

index 7bd39f4d1d379c20b0019b5d6f71a4a8d91a84c7..402e2705191ed7f22af6132bcf58521454f79bb6 100644 (file)
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -240,6 +240,7 @@ define void @f_f() nounwind {
  ; CHECK_O0-NEXT:  .LBB9_3: # %cif_mixed_test_all
  ; CHECK_O0-NEXT:    movl $-1, %eax
  ; CHECK_O0-NEXT:    vmovd %eax, %xmm0
+; CHECK_O0-NEXT:    vmovdqa %xmm0, %xmm0
  ; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
  ; CHECK_O0-NEXT:    # implicit-def: $rcx
  ; CHECK_O0-NEXT:    # implicit-def: $ymm2
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll

index 5b3fcb1e0233865cf50a663f24502fdea4a94ef4..af66ba6cd6d386316d7bc539a731767bc1ba97d5 100644 (file)
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -144,17 +144,15 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
  ; X32:       # %bb.0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovdqu (%ecx), %xmm0
-; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT:    vmovdqa %ymm0, (%eax)
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovaps %ymm0, (%eax)
  ; X32-NEXT:    vzeroupper
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: legal_vzmovl_2i64_4i64:
  ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT:    vmovdqa %ymm0, (%rsi)
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovaps %ymm0, (%rsi)
  ; X64-NEXT:    vzeroupper
  ; X64-NEXT:    retq
    %ld = load <2 x i64>, <2 x i64>* %in, align 8
@@ -196,17 +194,15 @@ define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
  ; X32:       # %bb.0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovdqu (%ecx), %xmm0
-; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT:    vmovdqa %ymm0, (%eax)
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovaps %ymm0, (%eax)
  ; X32-NEXT:    vzeroupper
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: legal_vzmovl_2f64_4f64:
  ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT:    vmovdqa %ymm0, (%rsi)
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovaps %ymm0, (%rsi)
  ; X64-NEXT:    vzeroupper
  ; X64-NEXT:    retq
    %ld = load <2 x double>, <2 x double>* %in, align 8
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll

index 33513fef481ada75daedcc80530a7a8ba95ef7f6..3717d05e9e5b7eb17e5fa2f7828161ad597d59cc 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1514,7 +1514,6 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
  define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
  ; ALL-LABEL: insert_reg_and_zero_v4f64:
  ; ALL:       # %bb.0:
-; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
  ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
  ; ALL-NEXT:    retq
    %v = insertelement <4 x double> undef, double %a, i32 0
author	Craig Topper <craig.topper@intel.com>
	Fri, 21 Jun 2019 19:10:21 +0000 (19:10 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Fri, 21 Jun 2019 19:10:21 +0000 (19:10 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
lib/Target/X86/X86InstrAVX512.td		patch | blob | history
lib/Target/X86/X86InstrSSE.td		patch | blob | history
test/CodeGen/X86/avx-load-store.ll		patch | blob | history
test/CodeGen/X86/vec_extract-avx.ll		patch | blob | history
test/CodeGen/X86/vector-shuffle-256-v4.ll		patch | blob | history