cl::desc("Enable the X86 cmov-to-branch optimization."),
cl::init(true), cl::Hidden);
+static cl::opt<unsigned>
+ GainCycleThreshold("x86-cmov-converter-threshold",
+ cl::desc("Minimum gain per loop (in cycles) threshold."),
+ cl::init(4), cl::Hidden);
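
The threshold can be tuned from the llc command line; a hypothetical invocation
(the flag names come from this patch, the input file is assumed):

  llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true \
      -x86-cmov-converter-threshold=8 foo.ll -o foo.s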
+
/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
// Critical-path is iteration dependent - there is a dependency of
// critical-path instructions on the critical-path instructions of the
// previous iteration.
- // Thus, it is required to check the gradient of the gain - the
- // change in Depth-Diff compared to the change in Loop-Depth between
- // 1st and 2nd iterations.
+ // Thus, check the gain percentage of the 2nd iteration (similar to the
+ // previous case); in addition, it is required to check the gradient of
+ // the gain - the change in Depth-Diff compared to the change in
+ // Loop-Depth between the 1st and 2nd iterations.
// To be conservative, the gradient needs to be at least 50%.
//
+ // In addition, in order not to optimize loops with a very small gain, the
+ // gain (in cycles) after the 2nd iteration must not be less than a given
+ // threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must hold.
+ //
// If loop is not worth optimizing, remove all CMOV-group-candidates.
//===--------------------------------------------------------------------===//
+ if (Diff[1] < GainCycleThreshold)
+ return false;
+
bool WorthOptLoop = false;
if (Diff[1] == Diff[0])
WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
else if (Diff[1] > Diff[0])
WorthOptLoop =
- (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth);
+ (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
+ (Diff[1] * 8 >= LoopDepth[1].Depth);
if (!WorthOptLoop)
return false;
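
To make the profitability checks concrete, here is a worked example with
illustrative numbers (the depths and diffs below are assumed, not measured
from real code):

  // Assume: LoopDepth[0].Depth = 24, LoopDepth[1].Depth = 48
  //         Diff[0] = 3, Diff[1] = 5  (critical-path cycles saved by CMOV
  //                                     removal in the 1st/2nd iteration)
  // Threshold check: Diff[1] >= GainCycleThreshold -> 5 >= 4  -> pass
  // Gradient check:  (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth -
  //                  LoopDepth[0].Depth)           -> 4 >= 24 -> fail
  // (The 12.5% check, Diff[1] * 8 >= LoopDepth[1].Depth, would also fail:
  //  40 < 48.)
  // WorthOptLoop remains false, so all CMOV-group candidates are removed.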
--- /dev/null
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; This test checks that the x86-cmov-converter optimization does not transform
+;; a CMOV instruction when the gain (in cycles) of converting it to a branch is
+;; less than a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
+;;
+;; Test was created using the following command line:
+;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
+;; Where foo.c is:
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;int bar(int *a, int *b, int n) {
+;; int sum = 0;
+;; for (int i = 0; i < n; ++i) {
+;; int x = a[i] * a[i+1] * a[i+2];
+;; int y = b[i] * b[i+1];
+;; sum += y > x ? x : 0;
+;; }
+;; return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Benchmarking the above function with the harness below shows that the code
+;; with CMOV is 25% faster than the code with a branch.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;#define N 10000
+;;int A[N];
+;;int B[N];
+;;
+;;int main () {
+;; for (int i=0; i< N; ++i) {
+;; A[i] = i%4;
+;; B[i] = i%5;
+;; }
+;; int sum = 0;
+;; for (int i=0; i< N*10; ++i)
+;; sum += bar(A, B, N);
+;; return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK-NOT: jg
+; CHECK: cmovle
+define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp30 = icmp sgt i32 %n, 0
+ br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %.pre = load i32, i32* %a, align 4
+ %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
+ %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+ %.pre35 = load i32, i32* %b, align 4
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
+ %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
+ %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %mul = mul nsw i32 %1, %2
+ %3 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
+ %4 = load i32, i32* %arrayidx5, align 4
+ %mul6 = mul nsw i32 %mul, %4
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+ %5 = load i32, i32* %arrayidx11, align 4
+ %mul12 = mul nsw i32 %5, %0
+ %cmp13 = icmp sgt i32 %mul12, %mul6
+ %cond = select i1 %cmp13, i32 %mul6, i32 0
+ %add14 = add nsw i32 %cond, %sum.032
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = {"target-cpu"="skylake"}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 5.0.0 (trunk)"}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that the x86-cmov-converter optimization transforms CMOV
;; instructions into branches when it is profitable.
-;; There are 5 cases below:
+;; There are 6 cases below:
;; 1. CmovInCriticalPath:
;; CMOV depends on the condition and it is in the hot path.
;; Thus, it is worth transforming.
;;
;; 2. CmovNotInCriticalPath:
-;; similar test like in (1), just that CMOV is not in the hot path.
+;; Similar test to (1), except that the CMOV is not in the hot path.
;; Thus, it is not worth transforming.
;;
;; 3. MaxIndex:
;; Usually, binary search CMOV is not predicted.
;; Thus, it is not worth transforming.
;;
+;; 6. SmallGainPerLoop:
+;; The gain percentage from converting CMOV into a branch is acceptable;
+;; however, the absolute gain (in cycles) is smaller than the threshold.
+;; Thus, it is not worth transforming.
+;;
;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
;; for (int i = 0; i < n; i++) {
-;; int t = c[i];
+;; int t = c[i] + 1;
;; if (c[i] * a > b)
;; t = 10;
-;; c[i] = t;
+;; c[i] = (c[i] + 1) * t;
;; }
;;}
;;
;; }
;; return Curr->Val;
;;}
+;;
+;;
+;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
+;; for (int i = 0; i < n; i++) {
+;; int t = c[i];
+;; if (c[i] * a > b)
+;; t = 10;
+;; c[i] = t;
+;; }
+;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%struct.Node = type { i32, %struct.Node*, %struct.Node* }
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, 1
%mul = mul nsw i32 %0, %a
%cmp3 = icmp sgt i32 %mul, %b
- %. = select i1 %cmp3, i32 10, i32 %0
- store i32 %., i32* %arrayidx, align 4
+ %. = select i1 %cmp3, i32 10, i32 %add
+ %mul7 = mul nsw i32 %., %add
+ store i32 %mul7, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
ret i32 %.lcssa
}
+; CHECK-LABEL: SmallGainPerLoop
+; CHECK-NOT: jg
+; CHECK: cmovg
+
+define void @SmallGainPerLoop(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
+entry:
+ %cmp14 = icmp sgt i32 %n, 0
+ br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %0, %a
+ %cmp3 = icmp sgt i32 %mul, %b
+ %. = select i1 %cmp3, i32 10, i32 %0
+ store i32 %., i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The following test checks that x86-cmov-converter optimization transforms
;; CMOV instructions into branch correctly.