From 3ccd12e35358d21b8784c0d1c2e78dc433793d92 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 30 Aug 2017 04:34:48 +0000
Subject: [PATCH] [X86] Provide a separate feature bit for macro fusion support
 instead of basing it on the AVX flag

Summary:
Currently we determine if macro fusion is supported based on the AVX flag as a
proxy for the processor being Sandy Bridge. This is really strange now that AMD
supports AVX. It also means that if the user explicitly disables AVX we disable
macro fusion.

This patch adds an explicit macro fusion feature. I've also enabled it for the
generic 64-bit CPU (which doesn't have AVX).

This is probably another candidate for being in the MI layer, but for now I at
least wanted to correct the overloading of the AVX feature.

Reviewers: spatel, chandlerc, RKSimon, zvi

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D37280

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312097 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td                  | 38 +++++++++++++++++++-------
 lib/Target/X86/X86MacroFusion.cpp      |  6 ++--
 lib/Target/X86/X86Subtarget.cpp        |  1 +
 lib/Target/X86/X86Subtarget.h          |  4 +++
 test/CodeGen/X86/avx-select.ll         |  4 +--
 test/CodeGen/X86/avx-splat.ll          |  2 +-
 test/CodeGen/X86/avx512-mask-op.ll     |  4 +--
 test/CodeGen/X86/vec_int_to_fp.ll      |  4 +--
 test/CodeGen/X86/x86-cmov-converter.ll |  2 +-
 9 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 37a7cdd779d..888af176a86 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -288,6 +288,13 @@ def FeatureERMSB
           "ermsb", "HasERMSB", "true",
           "REP MOVS/STOS are fast">;
 
+// Sandy Bridge and newer processors have many instructions that can be
+// fused with conditional branches and pass through the CPU as a single
+// operation.
+def FeatureMacroFusion
+    : SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
+                       "Various instructions can be fused with conditional branches">;
+
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
 //===----------------------------------------------------------------------===//
@@ -372,7 +379,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
   FeatureFXSR,
   FeatureCMPXCHG16B,
   FeatureSlowBTMem,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 def : ProcessorModel<"penryn", SandyBridgeModel, [
   FeatureX87,
@@ -382,7 +390,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
   FeatureFXSR,
   FeatureCMPXCHG16B,
   FeatureSlowBTMem,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 
 // Atom CPUs.
@@ -468,7 +477,8 @@ class NehalemProc : ProcessorModel;
 def : NehalemProc<"nehalem">;
 def : NehalemProc<"corei7">;
@@ -485,7 +495,8 @@ class WestmereProc : ProcessorModel;
 def : WestmereProc<"westmere">;
@@ -516,7 +527,8 @@ def SNBFeatures : ProcessorFeatures<[], [
   FeatureLAHFSAHF,
   FeatureSlow3OpsLEA,
   FeatureFastScalarFSQRT,
-  FeatureFastSHLDRotate
+  FeatureFastSHLDRotate,
+  FeatureMacroFusion
 ]>;
 class SandyBridgeProc : ProcModel;
 // Piledriver
 def : Proc<"bdver2", [
@@ -755,7 +768,8 @@ def : Proc<"bdver2", [
   FeatureLWP,
   FeatureFMA,
   FeatureSlowSHLD,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 // Steamroller
 def : Proc<"bdver3", [
@@ -782,7 +796,8 @@ def : Proc<"bdver3", [
   FeatureXSAVEOPT,
   FeatureSlowSHLD,
   FeatureFSGSBase,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 // Excavator
 def : Proc<"bdver4", [
@@ -810,7 +825,8 @@ def : Proc<"bdver4", [
   FeatureSlowSHLD,
   FeatureFSGSBase,
   FeatureLAHFSAHF,
-  FeatureMWAITX
+  FeatureMWAITX,
+  FeatureMacroFusion
 ]>;
 // Znver1
 def: ProcessorModel<"znver1", Znver1Model, [
@@ -830,6 +846,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
   FeatureFastLZCNT,
   FeatureLAHFSAHF,
   FeatureLZCNT,
+  FeatureMacroFusion,
   FeatureMMX,
   FeatureMOVBE,
   FeatureMWAITX,
@@ -873,7 +890,8 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
   Feature64Bit,
   FeatureSlow3OpsLEA,
   FeatureSlowBTMem,
-  FeatureSlowIncDec
+  FeatureSlowIncDec,
+  FeatureMacroFusion
 ]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 8fdf1061705..d3ef7aa8d6c 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -27,10 +27,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
                                    const MachineInstr *FirstMI,
                                    const MachineInstr &SecondMI) {
   const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
-  // Check if this processor supports macro-fusion. Since this is a minor
-  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
-  // proxy for SandyBridge+.
-  if (!ST.hasAVX())
+  // Check if this processor supports macro-fusion.
+  if (!ST.hasMacroFusion())
     return false;
 
   enum {
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 6ad6da95d7b..2a7733996c4 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -347,6 +347,7 @@ void X86Subtarget::initializeEnvironment() {
   HasFastVectorFSQRT = false;
   HasFastLZCNT = false;
   HasFastSHLDRotate = false;
+  HasMacroFusion = false;
   HasERMSB = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 8b869022d76..7c85e9c2eee 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -238,6 +238,9 @@ protected:
   /// True if SHLD based rotate is fast.
   bool HasFastSHLDRotate;
 
+  /// True if the processor supports macrofusion.
+  bool HasMacroFusion;
+
   /// True if the processor has enhanced REP MOVSB/STOSB.
   bool HasERMSB;
 
@@ -488,6 +491,7 @@ public:
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+  bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
diff --git a/test/CodeGen/X86/avx-select.ll b/test/CodeGen/X86/avx-select.ll
index 7484f8257ca..f5ab0cab17f 100644
--- a/test/CodeGen/X86/avx-select.ll
+++ b/test/CodeGen/X86/avx-select.ll
@@ -16,8 +16,8 @@ define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
 ;
 ; X64-LABEL: select00:
 ; X64: # BB#0:
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT: cmpl $255, %edi
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT: je .LBB0_2
 ; X64-NEXT: # BB#1:
 ; X64-NEXT: vmovaps %ymm0, %ymm1
@@ -44,8 +44,8 @@ define <4 x i64> @select01(i32 %a, <4 x i64> %b) nounwind {
 ;
 ; X64-LABEL: select01:
 ; X64: # BB#0:
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT: cmpl $255, %edi
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT: je .LBB1_2
 ; X64-NEXT: # BB#1:
 ; X64-NEXT: vmovaps %ymm0, %ymm1
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 91d1f64c670..0f3f3e5fb6e 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -60,8 +60,8 @@ define <8 x float> @funcE() nounwind {
 ; CHECK-LABEL: funcE:
 ; CHECK: # BB#0: # %for_exit499
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: # implicit-def: %YMM0
 ; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: # implicit-def: %YMM0
 ; CHECK-NEXT: jne .LBB4_2
 ; CHECK-NEXT: # BB#1: # %load.i1247
 ; CHECK-NEXT: pushq %rbp
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index f6d752ddc3c..77a2a021416 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -692,8 +692,8 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ;
 ; AVX512BW-LABEL: test8:
 ; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT: jg LBB17_1
 ; AVX512BW-NEXT: ## BB#2:
 ; AVX512BW-NEXT: vpcmpltud %zmm2, %zmm1, %k0
@@ -708,8 +708,8 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ;
 ; AVX512DQ-LABEL: test8:
 ; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512DQ-NEXT: jg LBB17_1
 ; AVX512DQ-NEXT: ## BB#2:
 ; AVX512DQ-NEXT: vpcmpltud %zmm2, %zmm1, %k0
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 7cb1c95cb01..3e36969f879 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1678,8 +1678,8 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
 ; VEX-NEXT: .LBB39_6:
 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT: testq %rax, %rax
+; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT: js .LBB39_8
 ; VEX-NEXT: # BB#7:
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
@@ -1914,8 +1914,8 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
 ; VEX-NEXT: .LBB41_6:
 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT: testq %rax, %rax
+; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT: js .LBB41_8
 ; VEX-NEXT: # BB#7:
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
diff --git a/test/CodeGen/X86/x86-cmov-converter.ll b/test/CodeGen/X86/x86-cmov-converter.ll
index cdb8894bfd9..5fec1380e14 100644
--- a/test/CodeGen/X86/x86-cmov-converter.ll
+++ b/test/CodeGen/X86/x86-cmov-converter.ll
@@ -296,9 +296,9 @@ while.end: ; preds = %while.body, %entry
 ; CHECK-LABEL: Transform
 ; CHECK-NOT: cmov
 ; CHECK: divl [[a:%[0-9a-z]*]]
-; CHECK: cmpl [[a]], %eax
 ; CHECK: movl $11, [[s1:%[0-9a-z]*]]
 ; CHECK: movl [[a]], [[s2:%[0-9a-z]*]]
+; CHECK: cmpl [[a]], %edx
 ; CHECK: ja [[SinkBB:.*]]
 ; CHECK: [[FalseBB:.*]]:
 ; CHECK: movl $22, [[s1]]
-- 
2.50.1
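
Note for readers (not part of the patch): with this change a CPU opts into
macro-fusion-aware scheduling simply by listing the new feature bit in its
processor definition. The TableGen sketch below is a hypothetical illustration
of that pattern; the CPU name is made up and it mirrors the hunks above rather
than adding anything to the commit.

    // Hypothetical example only: a processor definition that includes
    // FeatureMacroFusion. Listing the bit makes X86Subtarget::hasMacroFusion()
    // return true, so X86MacroFusion.cpp will schedule fusible cmp/jcc pairs
    // adjacently for this CPU, with or without AVX.
    def : ProcessorModel<"fusion-example", SandyBridgeModel, [
      FeatureX87,
      Feature64Bit,
      FeatureCMPXCHG16B,
      FeatureMacroFusion
    ]>;

Like any other subtarget feature, the bit can also be toggled from the command
line, e.g. llc -mattr=+macrofusion (or -mattr=-macrofusion to turn it off),
independently of whether AVX is enabled.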