From 7b4bd48edb52987bc4c36e57422572fe99606449 Mon Sep 17 00:00:00 2001
From: Nikolai Bozhenov
Date: Thu, 12 Jan 2017 19:54:27 +0000
Subject: [PATCH] [X86] Replace AND+IMM64 with SRL/SHL

Emit SHRQ/SHLQ instead of ANDQ with a 64-bit constant mask if the result
is unused and the mask has only higher/lower bits set. For example, with
this patch LLVM emits

  shrq $41, %rdi
  je

instead of

  movabsq $0xFFFFFE0000000000, %rcx
  testq %rcx, %rdi
  je

This reduces the number of instructions, code size, and register pressure.
The transformation is applied only in cases where the mask cannot be
encoded as an immediate value within the TESTQ instruction.

A standalone sketch of this mask check is included after the patch.

Differential Revision: https://reviews.llvm.org/D28198

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291806 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 61 ++++++++++++++++---
 test/CodeGen/X86/bypass-slow-division-64.ll   |  9 +--
 test/CodeGen/X86/bypass-slow-division-tune.ll |  5 +-
 test/CodeGen/X86/cmp.ll                       | 52 +++++++++++++++-
 4 files changed, 110 insertions(+), 17 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d2535a83091..787dff99367 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16018,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
     }
   }
 
+  // Sometimes flags can be set either with an AND or with an SRL/SHL
+  // instruction. SRL/SHL variant should be preferred for masks longer than this
+  // number of bits.
+  const int ShiftToAndMaxMaskWidth = 32;
+  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
+
   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
   // which may be the result of a CAST. We use the variable 'Op', which is the
   // non-casted variable when we check for possible users.
@@ -16066,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
     // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
-    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+    if (ZeroCheck && Op->hasOneUse() &&
         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
       EVT VT = Op.getValueType();
       unsigned BitWidth = VT.getSizeInBits();
@@ -16076,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
       APInt Mask = ArithOp.getOpcode() == ISD::SRL
                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
-      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
         break;
       Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                        DAG.getConstant(Mask, dl, VT));
@@ -16085,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
 
   case ISD::AND:
     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
-    // because a TEST instruction will be better.
+    // because a TEST instruction will be better. However, AND should be
+    // preferred if the instruction can be combined into ANDN.
     if (!hasNonFlagsUse(Op)) {
       SDValue Op0 = ArithOp->getOperand(0);
       SDValue Op1 = ArithOp->getOperand(1);
       EVT VT = ArithOp.getValueType();
       bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
       bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
+
+      // If we cannot select an ANDN instruction, check if we can replace
+      // AND+IMM64 with a shift before giving up. This is possible for masks
+      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
+      if (!isProperAndn) {
+        if (!ZeroCheck)
+          break;
+
+        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
+        auto *CN = dyn_cast<ConstantSDNode>(Op1);
+        if (!CN)
+          break;
+
+        const APInt &Mask = CN->getAPIntValue();
+        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
+          break; // Prefer TEST instruction.
+
+        unsigned BitWidth = Mask.getBitWidth();
+        unsigned LeadingOnes = Mask.countLeadingOnes();
+        unsigned TrailingZeros = Mask.countTrailingZeros();
+
+        if (LeadingOnes + TrailingZeros == BitWidth) {
+          assert(TrailingZeros < VT.getSizeInBits() &&
+                 "Shift amount should be less than the type width");
+          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
+          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
+          break;
+        }
+
+        unsigned LeadingZeros = Mask.countLeadingZeros();
+        unsigned TrailingOnes = Mask.countTrailingOnes();
+
+        if (LeadingZeros + TrailingOnes == BitWidth) {
+          assert(LeadingZeros < VT.getSizeInBits() &&
+                 "Shift amount should be less than the type width");
+          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
+          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
+          break;
+        }
 
-      // But if we can combine this into an ANDN operation, then create an AND
-      // now and allow it to be pattern matched into an ANDN.
-      if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) break;
+      }
     }
     LLVM_FALLTHROUGH;
   case ISD::SUB:
@@ -16116,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
   case ISD::XOR: Opcode = X86ISD::XOR; break;
   case ISD::AND: Opcode = X86ISD::AND; break;
   case ISD::OR: {
-    if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+    if (!NeedTruncation && ZeroCheck) {
       if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
         return EFLAGS;
     }
diff --git a/test/CodeGen/X86/bypass-slow-division-64.ll b/test/CodeGen/X86/bypass-slow-division-64.ll
index 35ae962a698..b067f9e1503 100644
--- a/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -9,8 +9,7 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; CHECK-NEXT:    testq %rcx, %rax
+; CHECK-NEXT:    shrq $32, %rax
 ; CHECK-NEXT:    je .LBB0_1
 ; CHECK-NEXT:  # BB#2:
 ; CHECK-NEXT:    movq %rdi, %rax
@@ -32,8 +31,7 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; CHECK-NEXT:    testq %rcx, %rax
+; CHECK-NEXT:    shrq $32, %rax
 ; CHECK-NEXT:    je .LBB1_1
 ; CHECK-NEXT:  # BB#2:
 ; CHECK-NEXT:    movq %rdi, %rax
@@ -57,8 +55,7 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; CHECK-NEXT:    testq %rcx, %rax
+; CHECK-NEXT:    shrq $32, %rax
 ; CHECK-NEXT:    je .LBB2_1
 ; CHECK-NEXT:  # BB#2:
 ; CHECK-NEXT:    movq %rdi, %rax
diff --git a/test/CodeGen/X86/bypass-slow-division-tune.ll b/test/CodeGen/X86/bypass-slow-division-tune.ll
index 56cca0de8e6..b6a53130cf2 100644
--- a/test/CodeGen/X86/bypass-slow-division-tune.ll
+++ b/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -22,9 +22,8 @@ entry:
 define i64 @div64(i64 %a, i64 %b) {
 entry:
 ; CHECK-LABEL: div64:
-; CHECK-DAG: movabsq $-4294967296, [[REGMSK:%[a-z]+]]
-; CHECK-DAG: orq %{{.*}}, [[REG:%[a-z]+]]
-; CHECK: testq [[REGMSK]], [[REG]]
+; CHECK: orq %{{.*}}, [[REG:%[a-z]+]]
+; CHECK: shrq $32, [[REG]]
 ; CHECK: divl
 ;
   %div = sdiv i64 %a, %b
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index d24f27ddf22..5d05c699f43 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -281,4 +281,54 @@ define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
 ; CHECK: setne
 ; CHECK: testl
 ; CHECK: setne
-}
\ No newline at end of file
+}
+
+define i32 @test21(i64 %val) {
+  %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+  %cmp = icmp ne i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test21
+; CHECK: shrq $41, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
+
+; AND-to-SHR transformation is enabled for eq/ne condition codes only.
+define i32 @test22(i64 %val) {
+  %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+  %cmp = icmp ult i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test22
+; CHECK-NOT: shrq $41
+; CHECK: retq
+}
+
+define i32 @test23(i64 %val) {
+  %and = and i64 %val, -1048576 ; 0xFFFFFFFFFFF00000
+  %cmp = icmp ne i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test23
+; CHECK: testq $-1048576, %rdi
+; CHECK: setne %al
+; CHECK: retq
+}
+
+define i32 @test24(i64 %val) {
+  %and = and i64 %val, 281474976710655 ; 0x0000FFFFFFFFFFFF
+  %cmp = icmp ne i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test24
+; CHECK: shlq $16, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
-- 
2.40.0
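
The following standalone C++ sketch (not part of the patch) models the mask
classification that EmitTest performs above: masks that fit a signed 32-bit
immediate are left to TESTQ, all-high-bit masks become a SHRQ by the number of
trailing zeros, and all-low-bit masks become a SHLQ by the number of leading
zeros. The helper names (classifyMask, MaskKind) are illustrative and do not
appear in LLVM.

// Standalone model of the AND-mask classification in this patch, assuming a
// 64-bit AND whose result feeds only a zero/non-zero check.
#include <cstdint>
#include <cstdio>

enum class MaskKind { KeepTest, UseShr, UseShl };

struct Classification {
  MaskKind Kind;
  unsigned ShiftAmount; // meaningful for UseShr/UseShl only
};

static unsigned countTrailingZeros64(uint64_t V) {
  unsigned N = 0;
  while (V && !(V & 1)) { V >>= 1; ++N; }
  return V ? N : 64;
}

static unsigned countLeadingZeros64(uint64_t V) {
  unsigned N = 0;
  for (int Bit = 63; Bit >= 0 && !((V >> Bit) & 1); --Bit) ++N;
  return N;
}

static Classification classifyMask(uint64_t Mask) {
  // Masks that can be encoded as a signed 32-bit immediate stay with TESTQ.
  int64_t Signed = static_cast<int64_t>(Mask);
  if (Signed >= INT32_MIN && Signed <= INT32_MAX)
    return {MaskKind::KeepTest, 0};

  // Leading ones of Mask == leading zeros of ~Mask; trailing ones likewise.
  unsigned LeadingOnes = countLeadingZeros64(~Mask);
  unsigned TrailingZeros = countTrailingZeros64(Mask);
  // 0xFFFFFE0000000000-style mask: only high bits set -> SHRQ by TrailingZeros.
  if (LeadingOnes + TrailingZeros == 64)
    return {MaskKind::UseShr, TrailingZeros};

  unsigned LeadingZeros = countLeadingZeros64(Mask);
  unsigned TrailingOnes = countTrailingZeros64(~Mask);
  // 0x0000FFFFFFFFFFFF-style mask: only low bits set -> SHLQ by LeadingZeros.
  if (LeadingZeros + TrailingOnes == 64)
    return {MaskKind::UseShl, LeadingZeros};

  return {MaskKind::KeepTest, 0};
}

int main() {
  // Mirrors the masks used by test21, test24, and test23 in cmp.ll.
  uint64_t Masks[] = {0xFFFFFE0000000000ULL, 0x0000FFFFFFFFFFFFULL,
                      0xFFFFFFFFFFF00000ULL};
  for (uint64_t M : Masks) {
    Classification C = classifyMask(M);
    const char *Kind = C.Kind == MaskKind::UseShr   ? "shrq"
                       : C.Kind == MaskKind::UseShl ? "shlq"
                                                    : "testq";
    printf("0x%016llX -> %s %u\n", (unsigned long long)M, Kind, C.ShiftAmount);
  }
}

Running the sketch reports shift amounts 41 and 16 for the test21 and test24
masks, and keeps testq for the test23 mask, matching the CHECK lines above.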