[X86] Make masked floating point equality/ordered compares commutable for load folding purposes.

author Craig Topper <craig.topper@intel.com>

Thu, 6 Jun 2019 16:39:04 +0000 (16:39 +0000)

committer Craig Topper <craig.topper@intel.com>

Thu, 6 Jun 2019 16:39:04 +0000 (16:39 +0000)
author Craig Topper <craig.topper@intel.com>
Thu, 6 Jun 2019 16:39:04 +0000 (16:39 +0000)
committer Craig Topper <craig.topper@intel.com>
Thu, 6 Jun 2019 16:39:04 +0000 (16:39 +0000)
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td

index 4302b3e1ed5594831799e6ac51c9dc3928504a7d..8987b6a33bf8b42a1097a4e730062f57b32f95bc 100644 (file)
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -365,7 +365,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
                                    list<dag> Pattern,
                                    list<dag> MaskingPattern,
                                    bit IsCommutable = 0> {
-    let isCommutable = IsCommutable in
+    let isCommutable = IsCommutable in {
      def NAME: AVX512<O, F, Outs, Ins,
                         OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
                                       "$dst, "#IntelSrcAsm#"}",
@@ -375,6 +375,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
                         OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
                                       "$dst {${mask}}, "#IntelSrcAsm#"}",
                         MaskingPattern>, EVEX_K;
+    }
  }
  
  multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
@@ -5788,11 +5789,10 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
    // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
    // There are just too many permuations due to commutability and bitcasts.
    let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
-  let isCommutable = 1 in
    defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
                     (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (null_frag), (null_frag)>,
+                   (null_frag), (null_frag), 1>,
                     EVEX_4V, Sched<[sched]>;
    let mayLoad = 1 in
    defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 53b23c75737eee6168b2337cc9993c8f9e469ba9..8ac038329205a4cca76002b494b6fbcfefe1feda 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1837,18 +1837,28 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
    case X86::VCMPPDZ128rri:
    case X86::VCMPPSZ128rri:
    case X86::VCMPPDZ256rri:
-  case X86::VCMPPSZ256rri: {
+  case X86::VCMPPSZ256rri:
+  case X86::VCMPPDZrrik:
+  case X86::VCMPPSZrrik:
+  case X86::VCMPPDZ128rrik:
+  case X86::VCMPPSZ128rrik:
+  case X86::VCMPPDZ256rrik:
+  case X86::VCMPPSZ256rrik: {
+    unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
+
      // Float comparison can be safely commuted for
      // Ordered/Unordered/Equal/NotEqual tests
-    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+    unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
      switch (Imm) {
      case 0x00: // EQUAL
      case 0x03: // UNORDERED
      case 0x04: // NOT EQUAL
      case 0x07: // ORDERED
-      // The indices of the commutable operands are 1 and 2.
+      // The indices of the commutable operands are 1 and 2 (or 2 and 3
+      // when masked).
        // Assign them to the returned operand indices here.
-      return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+      return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
+                                  2 + OpOffset);
      }
      return false;
    }
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512.ll b/test/CodeGen/X86/stack-folding-fp-avx512.ll

index f3da45d5e301d002c3debe9ad515bc8a3f88a170..def0fe943efa77954f6fb0ad0c036162df89aebe 100644 (file)
--- a/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -146,6 +146,34 @@ define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
  }
  declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
  
+define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
+  ;CHECK-LABEL: stack_fold_cmppd_mask:
+  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <8 x double>, <8 x double>* %a2
+  %3 = fadd <8 x double> %a1, %2
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, i32 4)
+  %6 = and <8 x i1> %4, %5
+  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
+  ret <8 x double> %7
+}
+
+define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
+  ;CHECK-LABEL: stack_fold_cmppd_mask_commuted:
+  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <8 x double>, <8 x double>* %a2
+  %3 = fadd <8 x double> %a1, %2
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, i32 4)
+  %6 = and <8 x i1> %4, %5
+  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
+  ret <8 x double> %7
+}
+
  define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
    ;CHECK-LABEL: stack_fold_cmpps
    ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -156,6 +184,34 @@ define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
  }
  declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
  
+define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
+  ;CHECK-LABEL: stack_fold_cmpps_mask:
+  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <16 x float>, <16 x float>* %a2
+  %3 = fadd <16 x float> %a1, %2
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, i32 4)
+  %6 = and <16 x i1> %4, %5
+  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
+  ret <16 x float> %7
+}
+
+define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
+  ;CHECK-LABEL: stack_fold_cmpps_mask_commuted:
+  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <16 x float>, <16 x float>* %a2
+  %3 = fadd <16 x float> %a1, %2
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, i32 4)
+  %6 = and <16 x i1> %4, %5
+  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
+  ret <16 x float> %7
+}
+
  define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
    ;CHECK-LABEL: stack_fold_divsd_int
    ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
author	Craig Topper <craig.topper@intel.com>
	Thu, 6 Jun 2019 16:39:04 +0000 (16:39 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Thu, 6 Jun 2019 16:39:04 +0000 (16:39 +0000)
lib/Target/X86/X86InstrAVX512.td		patch \| blob \| history
lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
test/CodeGen/X86/stack-folding-fp-avx512.ll		patch \| blob \| history