[X86] Tune bypassing of slow division for Intel CPUs

author Nikolai Bozhenov <nikolai.bozhenov@intel.com>
Thu, 12 Jan 2017 19:34:15 +0000 (19:34 +0000)

committer Nikolai Bozhenov <nikolai.bozhenov@intel.com>
Thu, 12 Jan 2017 19:34:15 +0000 (19:34 +0000)
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td

index f95022077d356d605f6b5fad0c14868a4271cd27..83a23d4ad680ecadffe229b8b08e8d2870d5bfea 100644 (file)
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
  def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
                                       "HasSlowDivide32", "true",
                                       "Use 8-bit divide for positive values less than 256">;
-def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
                                       "HasSlowDivide64", "true",
-                                     "Use 16-bit divide for positive values less than 65536">;
+                                     "Use 32-bit divide for positive values less than 2^32">;
  def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
                                       "PadShortFunctions", "true",
                                       "Pad short functions">;
@@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [
    FeatureCMPXCHG16B,
    FeaturePOPCNT,
    FeatureAES,
+  FeatureSlowDivide64,
    FeaturePCLMUL,
    FeatureXSAVE,
    FeatureXSAVEOPT,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 05057dabefd2950773f84fe5cf864bd090f50b7d..d2535a83091336aa425a1eee014fe5840cfb21ef 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  
-  // Bypass expensive divides on Atom when compiling with O2.
+  // Bypass expensive divides and use cheaper ones.
    if (TM.getOptLevel() >= CodeGenOpt::Default) {
      if (Subtarget.hasSlowDivide32())
        addBypassSlowDiv(32, 8);
      if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
-      addBypassSlowDiv(64, 16);
+      addBypassSlowDiv(64, 32);
    }
  
    if (Subtarget.isTargetKnownWindowsMSVC() ||
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h

index 92c16214aa4a7ab4211f420433ed78d0408609f4..d80dc4a9b5e805012f7ca04562b95176fdc48f16 100644 (file)
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -216,7 +216,7 @@ protected:
    /// 32-bit divisions and should be used when possible.
    bool HasSlowDivide32;
  
-  /// True if 16-bit divides are significantly faster than
+  /// True if 32-bit divides are significantly faster than
    /// 64-bit divisions and should be used when possible.
    bool HasSlowDivide64;
  
diff --git a/test/CodeGen/X86/atom-bypass-slow-division-64.ll b/test/CodeGen/X86/atom-bypass-slow-division-64.ll

index 7ad66ddadf01f715a0e7f086c046c7066940a483..5e90c9e985db956428eaaba1a038fee4d0537c47 100644 (file)
--- a/test/CodeGen/X86/atom-bypass-slow-division-64.ll
+++ b/test/CodeGen/X86/atom-bypass-slow-division-64.ll
@@ -1,5 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc < %s -mcpu=atom -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mcpu=sandybridge -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=SNB
  
  ; Additional tests for 64-bit divide bypass
  
@@ -7,8 +8,9 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
  ; CHECK-LABEL: Test_get_quotient:
  ; CHECK:       # BB#0:
  ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
  ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    testq $-65536, %rax # imm = 0xFFFF0000
+; CHECK-NEXT:    testq %rcx, %rax
  ; CHECK-NEXT:    je .LBB0_1
  ; CHECK-NEXT:  # BB#2:
  ; CHECK-NEXT:    movq %rdi, %rax
@@ -18,9 +20,28 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
  ; CHECK-NEXT:  .LBB0_1:
  ; CHECK-NEXT:    xorl %edx, %edx
  ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    divw %si
-; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    divl %esi
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<def>
  ; CHECK-NEXT:    retq
+;
+; SNB-LABEL: Test_get_quotient:
+; SNB:       # BB#0:
+; SNB-NEXT:    movq %rdi, %rax
+; SNB-NEXT:    orq %rsi, %rax
+; SNB-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; SNB-NEXT:    testq %rcx, %rax
+; SNB-NEXT:    je .LBB0_1
+; SNB-NEXT:  # BB#2:
+; SNB-NEXT:    movq %rdi, %rax
+; SNB-NEXT:    cqto
+; SNB-NEXT:    idivq %rsi
+; SNB-NEXT:    retq
+; SNB-NEXT:  .LBB0_1:
+; SNB-NEXT:    xorl %edx, %edx
+; SNB-NEXT:    movl %edi, %eax
+; SNB-NEXT:    divl %esi
+; SNB-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<def>
+; SNB-NEXT:    retq
    %result = sdiv i64 %a, %b
    ret i64 %result
  }
@@ -29,8 +50,9 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
  ; CHECK-LABEL: Test_get_remainder:
  ; CHECK:       # BB#0:
  ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
  ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    testq $-65536, %rax # imm = 0xFFFF0000
+; CHECK-NEXT:    testq %rcx, %rax
  ; CHECK-NEXT:    je .LBB1_1
  ; CHECK-NEXT:  # BB#2:
  ; CHECK-NEXT:    movq %rdi, %rax
@@ -41,9 +63,31 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
  ; CHECK-NEXT:  .LBB1_1:
  ; CHECK-NEXT:    xorl %edx, %edx
  ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    divw %si
-; CHECK-NEXT:    movzwl %dx, %eax
+; CHECK-NEXT:    divl %esi
+; CHECK-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
+; CHECK-NEXT:    movq %rdx, %rax
  ; CHECK-NEXT:    retq
+;
+; SNB-LABEL: Test_get_remainder:
+; SNB:       # BB#0:
+; SNB-NEXT:    movq %rdi, %rax
+; SNB-NEXT:    orq %rsi, %rax
+; SNB-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; SNB-NEXT:    testq %rcx, %rax
+; SNB-NEXT:    je .LBB1_1
+; SNB-NEXT:  # BB#2:
+; SNB-NEXT:    movq %rdi, %rax
+; SNB-NEXT:    cqto
+; SNB-NEXT:    idivq %rsi
+; SNB-NEXT:    movq %rdx, %rax
+; SNB-NEXT:    retq
+; SNB-NEXT:  .LBB1_1:
+; SNB-NEXT:    xorl %edx, %edx
+; SNB-NEXT:    movl %edi, %eax
+; SNB-NEXT:    divl %esi
+; SNB-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SNB-NEXT:    movq %rdx, %rax
+; SNB-NEXT:    retq
    %result = srem i64 %a, %b
    ret i64 %result
  }
@@ -52,8 +96,9 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
  ; CHECK-LABEL: Test_get_quotient_and_remainder:
  ; CHECK:       # BB#0:
  ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
  ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    testq $-65536, %rax # imm = 0xFFFF0000
+; CHECK-NEXT:    testq %rcx, %rax
  ; CHECK-NEXT:    je .LBB2_1
  ; CHECK-NEXT:  # BB#2:
  ; CHECK-NEXT:    movq %rdi, %rax
@@ -64,11 +109,33 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
  ; CHECK-NEXT:  .LBB2_1:
  ; CHECK-NEXT:    xorl %edx, %edx
  ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    divw %si
-; CHECK-NEXT:    movzwl %ax, %eax
-; CHECK-NEXT:    movzwl %dx, %edx
+; CHECK-NEXT:    divl %esi
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
  ; CHECK-NEXT:    addq %rdx, %rax
  ; CHECK-NEXT:    retq
+;
+; SNB-LABEL: Test_get_quotient_and_remainder:
+; SNB:       # BB#0:
+; SNB-NEXT:    movq %rdi, %rax
+; SNB-NEXT:    orq %rsi, %rax
+; SNB-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; SNB-NEXT:    testq %rcx, %rax
+; SNB-NEXT:    je .LBB2_1
+; SNB-NEXT:  # BB#2:
+; SNB-NEXT:    movq %rdi, %rax
+; SNB-NEXT:    cqto
+; SNB-NEXT:    idivq %rsi
+; SNB-NEXT:    addq %rdx, %rax
+; SNB-NEXT:    retq
+; SNB-NEXT:  .LBB2_1:
+; SNB-NEXT:    xorl %edx, %edx
+; SNB-NEXT:    movl %edi, %eax
+; SNB-NEXT:    divl %esi
+; SNB-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SNB-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<def>
+; SNB-NEXT:    addq %rdx, %rax
+; SNB-NEXT:    retq
    %resultdiv = sdiv i64 %a, %b
    %resultrem = srem i64 %a, %b
    %result = add i64 %resultdiv, %resultrem
diff --git a/test/CodeGen/X86/slow-div.ll b/test/CodeGen/X86/slow-div.ll

index 82928521ac2b28c0ddbe07d3951aa736a9473fa2..5d58a07d423f35a6f7f19b31075252b934a7350e 100644 (file)
--- a/test/CodeGen/X86/slow-div.ll
+++ b/test/CodeGen/X86/slow-div.ll
@@ -1,5 +1,5 @@
  ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divl < %s | FileCheck -check-prefix=DIV64 %s
  
  define i32 @div32(i32 %a, i32 %b) {
  entry:
@@ -16,11 +16,12 @@ entry:
  define i64 @div64(i64 %a, i64 %b) {
  entry:
  ; DIV32-LABEL: div64:
-; DIV32-NOT: divw
+; DIV32-NOT: divl
  ; DIV64-LABEL: div64:
-; DIV64: orq %{{.*}}, [[REG:%[a-z]+]]
-; DIV64: testq   $-65536, [[REG]]
-; DIV64: divw
+; DIV64-DAG: movabsq $-4294967296, [[REGMSK:%[a-z]+]]
+; DIV64-DAG: orq %{{.*}}, [[REG:%[a-z]+]]
+; DIV64: testq [[REGMSK]], [[REG]]
+; DIV64: divl
    %div = sdiv i64 %a, %b
    ret i64 %div
  }
author	Nikolai Bozhenov <nikolai.bozhenov@intel.com>
	Thu, 12 Jan 2017 19:34:15 +0000 (19:34 +0000)
committer	Nikolai Bozhenov <nikolai.bozhenov@intel.com>
	Thu, 12 Jan 2017 19:34:15 +0000 (19:34 +0000)
lib/Target/X86/X86.td		patch | blob | history
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
lib/Target/X86/X86Subtarget.h		patch | blob | history
test/CodeGen/X86/atom-bypass-slow-division-64.ll		patch | blob | history
test/CodeGen/X86/slow-div.ll		patch | blob | history