[X86] Replace AND+IMM64 with SRL/SHL

author Nikolai Bozhenov <nikolai.bozhenov@intel.com>
Thu, 12 Jan 2017 19:54:27 +0000 (19:54 +0000)
committer Nikolai Bozhenov <nikolai.bozhenov@intel.com>
Thu, 12 Jan 2017 19:54:27 +0000 (19:54 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d2535a83091336aa425a1eee014fe5840cfb21ef..787dff99367e8f160f11143d6b31b33e1bc409c7 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16018,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
        }
    }
  
+  // Sometimes flags can be set either with an AND or with an SRL/SHL
+  // instruction. The SRL/SHL variant is preferred when the mask does not fit
+  // in a signed immediate of this many bits.
+  const int ShiftToAndMaxMaskWidth = 32;
+  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
+
    // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
    // which may be the result of a CAST.  We use the variable 'Op', which is the
    // non-casted variable when we check for possible users.
@@ -16066,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
      // If we have a constant logical shift that's only used in a comparison
      // against zero turn it into an equivalent AND. This allows turning it into
      // a TEST instruction later.
-    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+    if (ZeroCheck && Op->hasOneUse() &&
          isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
        EVT VT = Op.getValueType();
        unsigned BitWidth = VT.getSizeInBits();
@@ -16076,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
        APInt Mask = ArithOp.getOpcode() == ISD::SRL
                         ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                         : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
-      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
          break;
        Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                         DAG.getConstant(Mask, dl, VT));
@@ -16085,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
  
    case ISD::AND:
      // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
-    // because a TEST instruction will be better.
+    // because a TEST instruction will be better. However, AND should be
+    // preferred if the instruction can be combined into ANDN.
      if (!hasNonFlagsUse(Op)) {
        SDValue Op0 = ArithOp->getOperand(0);
        SDValue Op1 = ArithOp->getOperand(1);
        EVT VT = ArithOp.getValueType();
        bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
        bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
+
+      // If we cannot select an ANDN instruction, check if we can replace
+      // AND+IMM64 with a shift before giving up. This is possible when the mask
+      // is a solid run of high or low bits and only the zero flag is needed.
+      if (!isProperAndn) {
+        if (!ZeroCheck)
+          break;
+
+        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
+        auto *CN = dyn_cast<ConstantSDNode>(Op1);
+        if (!CN)
+          break;
+
+        const APInt &Mask = CN->getAPIntValue();
+        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
+          break; // Prefer TEST instruction.
+
+        unsigned BitWidth = Mask.getBitWidth();
+        unsigned LeadingOnes = Mask.countLeadingOnes();
+        unsigned TrailingZeros = Mask.countTrailingZeros();
+
+        if (LeadingOnes + TrailingZeros == BitWidth) {
+          assert(TrailingZeros < VT.getSizeInBits() &&
+                 "Shift amount should be less than the type width");
+          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
+          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
+          break;
+        }
+
+        unsigned LeadingZeros = Mask.countLeadingZeros();
+        unsigned TrailingOnes = Mask.countTrailingOnes();
+
+        if (LeadingZeros + TrailingOnes == BitWidth) {
+          assert(LeadingZeros < VT.getSizeInBits() &&
+                 "Shift amount should be less than the type width");
+          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
+          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
+          break;
+        }
  
-      // But if we can combine this into an ANDN operation, then create an AND
-      // now and allow it to be pattern matched into an ANDN.
-      if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
          break;
+      }
      }
      LLVM_FALLTHROUGH;
    case ISD::SUB:
@@ -16116,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
      case ISD::XOR: Opcode = X86ISD::XOR; break;
      case ISD::AND: Opcode = X86ISD::AND; break;
      case ISD::OR: {
-      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+      if (!NeedTruncation && ZeroCheck) {
          if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
            return EFLAGS;
        }
diff --git a/test/CodeGen/X86/bypass-slow-division-64.ll b/test/CodeGen/X86/bypass-slow-division-64.ll

index 35ae962a698319757586fd5d745ee49bf539a360..b067f9e1503c611c98c8bbe3893884f544a7004d 100644 (file)
--- a/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -9,8 +9,7 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
  ; CHECK:       # BB#0:
  ; CHECK-NEXT:    movq %rdi, %rax
  ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; CHECK-NEXT:    testq %rcx, %rax
+; CHECK-NEXT:    shrq $32, %rax
  ; CHECK-NEXT:    je .LBB0_1
  ; CHECK-NEXT:  # BB#2:
  ; CHECK-NEXT:    movq %rdi, %rax
@@ -32,8 +31,7 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
  ; CHECK:       # BB#0:
  ; CHECK-NEXT:    movq %rdi, %rax
  ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; CHECK-NEXT:    testq %rcx, %rax
+; CHECK-NEXT:    shrq $32, %rax
  ; CHECK-NEXT:    je .LBB1_1
  ; CHECK-NEXT:  # BB#2:
  ; CHECK-NEXT:    movq %rdi, %rax
@@ -57,8 +55,7 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
  ; CHECK:       # BB#0:
  ; CHECK-NEXT:    movq %rdi, %rax
  ; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; CHECK-NEXT:    testq %rcx, %rax
+; CHECK-NEXT:    shrq $32, %rax
  ; CHECK-NEXT:    je .LBB2_1
  ; CHECK-NEXT:  # BB#2:
  ; CHECK-NEXT:    movq %rdi, %rax
diff --git a/test/CodeGen/X86/bypass-slow-division-tune.ll b/test/CodeGen/X86/bypass-slow-division-tune.ll

index 56cca0de8e613e65058a3cb0fc9b653b8005c88b..b6a53130cf23e40a981d831e0f34214e0ef1e1b3 100644 (file)
--- a/test/CodeGen/X86/bypass-slow-division-tune.ll
+++ b/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -22,9 +22,8 @@ entry:
  define i64 @div64(i64 %a, i64 %b) {
  entry:
  ; CHECK-LABEL: div64:
-; CHECK-DAG: movabsq $-4294967296, [[REGMSK:%[a-z]+]]
-; CHECK-DAG: orq     %{{.*}}, [[REG:%[a-z]+]]
-; CHECK:     testq   [[REGMSK]], [[REG]]
+; CHECK:     orq     %{{.*}}, [[REG:%[a-z]+]]
+; CHECK:     shrq    $32, [[REG]]
  ; CHECK:     divl
  ;
    %div = sdiv i64 %a, %b
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll

index d24f27ddf22c7393587f00450f5046ae65f41d9a..5d05c699f431192bea1041e4b357dbb9fc24d1c2 100644 (file)
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -281,4 +281,54 @@ define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
  ; CHECK: setne
  ; CHECK: testl
  ; CHECK: setne
-}
-\ No newline at end of file
+}
+
+define i32 @test21(i64 %val) {
+  %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+  %cmp = icmp ne i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test21
+; CHECK: shrq $41, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
+
+; AND-to-SHR transformation is enabled for eq/ne condition codes only.
+define i32 @test22(i64 %val) {
+  %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+  %cmp = icmp ult i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test22
+; CHECK-NOT: shrq $41
+; CHECK: retq
+}
+
+define i32 @test23(i64 %val) {
+  %and = and i64 %val, -1048576 ; 0xFFFFFFFFFFF00000
+  %cmp = icmp ne i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test23
+; CHECK: testq $-1048576, %rdi
+; CHECK: setne %al
+; CHECK: retq
+}
+
+define i32 @test24(i64 %val) {
+  %and = and i64 %val, 281474976710655 ; 0x0000FFFFFFFFFFFF
+  %cmp = icmp ne i64 %and, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+
+; CHECK-LABEL: test24
+; CHECK: shlq $16, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
author	Nikolai Bozhenov <nikolai.bozhenov@intel.com>
	Thu, 12 Jan 2017 19:54:27 +0000 (19:54 +0000)
committer	Nikolai Bozhenov <nikolai.bozhenov@intel.com>
	Thu, 12 Jan 2017 19:54:27 +0000 (19:54 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/bypass-slow-division-64.ll		patch \| blob \| history
test/CodeGen/X86/bypass-slow-division-tune.ll		patch \| blob \| history
test/CodeGen/X86/cmp.ll		patch \| blob \| history