  return DAG.getBitcast(N->getValueType(0), Shift);
}
+// Given a non-indexed load that was lowered from a GEP IR instruction with a
+// single indexing dimension, return the index node.
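+// As an illustrative sketch of the shape this matches: the load's base
+// pointer is expected to look like
+//   (add (shl index, scale), base)
+// in which case the 'index' operand of the SHL is returned. Only the ADD and
+// SHL opcodes are checked here; the scale and base operands are not inspected.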
+static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
+  if (Ld->isIndexed())
+    return SDValue();
+
+  SDValue Base = Ld->getBasePtr();
+
+  if (Base.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue ShiftedIndex = Base.getOperand(0);
+
+  if (ShiftedIndex.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  return ShiftedIndex.getOperand(0);
+}
+
+static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
+  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
+    switch (VT.getSizeInBits()) {
+    default: return false;
+    case 64: return Subtarget.is64Bit();
+    case 32: return true;
+    }
+  }
+  return false;
+}
+
+// This function recognizes cases where the X86 bzhi instruction can replace an
+// 'and-load' sequence: loading an integer value from an array of constants
+// defined as
+//
+//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
+//
+// and then applying a bitwise AND between the loaded value and another input.
+// This is equivalent to performing bzhi (zero high bits) on the input, using
+// the load's index as the bit count.
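+//
+// For example (an illustrative sketch; the names are made up for this
+// comment), a function such as
+//
+//   int foo(int x, int i) { return x & array[i]; }
+//
+// masks 'x' with array[i] == (1 << i) - 1, keeping the low 'i' bits of 'x',
+// which is exactly what 'bzhi' computes from 'x' and 'i' directly.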
+static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT VT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  // Check if the subtarget has a BZHI instruction for the node's type.
+  if (!hasBZHI(Subtarget, VT))
+    return SDValue();
+
+  // Try matching the pattern for both operands.
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue N = Node->getOperand(i);
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
+
+    // Bail out if the operand is not a load instruction.
+    if (!Ld)
+      return SDValue();
+
+    const Value *MemOp = Ld->getMemOperand()->getValue();
+
+    if (!MemOp)
+      return SDValue();
+
+    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
+      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
+        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+
+          Constant *Init = GV->getInitializer();
+          Type *Ty = Init->getType();
+          if (!isa<ConstantDataArray>(Init) ||
+              !Ty->getArrayElementType()->isIntegerTy() ||
+              Ty->getArrayElementType()->getScalarSizeInBits() !=
+                  VT.getSizeInBits() ||
+              Ty->getArrayNumElements() >
+                  Ty->getArrayElementType()->getScalarSizeInBits())
+            continue;
+
+          // Check that the array's constant elements match the expected
+          // pattern, i.e. arr[j] == (1 << j) - 1 for every j.
+          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+          bool ConstantsMatch = true;
+          for (uint64_t j = 0; j < ArrayElementCount; j++) {
+            ConstantInt *Elem =
+                cast<ConstantInt>(Init->getAggregateElement(j));
+            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
+              ConstantsMatch = false;
+              break;
+            }
+          }
+          if (!ConstantsMatch)
+            continue;
+
+          // Do the transformation (for a 32-bit type):
+          //   -> (and (load arr[idx]), inp)
+          //   <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
+          // which will be selected as a single bzhi instruction.
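+          // For example, for idx == 5 both forms produce the same mask:
+          //   srl(0xFFFFFFFF, 32 - 5) == 0x0000001F == arr[5].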
+          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
+          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
+
+          // Get the node which indexes into the array.
+          SDValue Index = getIndexFromUnindexedLoad(Ld);
+          if (!Index)
+            return SDValue();
+          Index = DAG.getZExtOrTrunc(Index, dl, VT);
+
+          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
+
+          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
+          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
+
+          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
    return ShiftRight;
+  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
+    return R;
+
  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
; CHECK-LABEL: f32_bzhi:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movslq %esi, %rax
-; CHECK-NEXT: andl fill_table32(,%rax,4), %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f32_bzhi:
; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl fill_table32(,%eax,4), %eax
-; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
entry:
  %idxprom = sext i32 %y to i64
  %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
; CHECK-LABEL: f32_bzhi_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movslq %esi, %rax
-; CHECK-NEXT: andl fill_table32_partial(,%rax,4), %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f32_bzhi_partial:
; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl fill_table32_partial(,%eax,4), %eax
-; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
entry:
  %idxprom = sext i32 %y to i64
  %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
; CHECK-LABEL: f64_bzhi:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq fill_table64(,%rsi,8), %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f64_bzhi:
; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl fill_table64(,%eax,8), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: retl
entry:
  %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
  %0 = load i64, i64* %arrayidx, align 8
define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
; CHECK-LABEL: f64_bzhi_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq fill_table64_partial(,%rsi,8), %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f64_bzhi_partial:
; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl fill_table64_partial(,%eax,8), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: retl
entry:
  %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
  %0 = load i64, i64* %arrayidx, align 8