return SDValue();
}
+static SDValue PerformSHLSimplify(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
+ // Allow the generic combiner to identify potential bswaps.
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ // DAG combiner will fold:
+ // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+ // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+ // Other code patterns that can also be modified have the following form:
+ // b + ((a << 1) | 510)
+ // b + ((a << 1) & 510)
+ // b + ((a << 1) ^ 510)
+ // b + ((a << 1) + 510)
+
+ // Many instructions can perform the shift for free, but it requires both
+ // operands to be registers. If c1 << c2 is too large, a mov immediate
+ // instruction will be needed. So, unfold back to the original pattern if:
+ // - c1 and c2 are small enough that they don't require mov imms.
+ // - the user(s) of the node can perform a shl themselves.
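+ //
+ // For example, (add (or (shl a, 1), 510), b) becomes
+ // (add (shl (or a, 255), 1), b): 255 fits in an immediate operand and the
+ // remaining shl can be folded into the add as a shifted-register operand.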
+
+ // No shifted operands for 16-bit instructions.
+ if (ST->isThumb() && ST->isThumb1Only())
+ return SDValue();
+
+ // Check that all the users could perform the shl themselves.
+ for (auto U : N->uses()) {
+ switch (U->getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::SUB:
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SETCC:
+ case ARMISD::CMP:
+ // Check that it's not already using a shl.
+ if (U->getOperand(0).getOpcode() == ISD::SHL ||
+ U->getOperand(1).getOpcode() == ISD::SHL)
+ return SDValue();
+ break;
+ }
+ }
+
+ if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
+ N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::SHL)
+ return SDValue();
+
+ SDValue SHL = N->getOperand(0);
+
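+ // Both the combined constant (c1 << c2) and the shift amount must be
+ // constants.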
+ auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+ if (!C1ShlC2 || !C2)
+ return SDValue();
+
+ DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());
+
+ APInt C2Int = C2->getAPIntValue();
+ APInt C1Int = C1ShlC2->getAPIntValue();
+
+ // Check that performing a lshr will not lose any information.
+ APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
+ C2Int.getBitWidth() - C2->getZExtValue());
+ if ((C1Int & Mask) != C1Int)
+ return SDValue();
+
+ // Shift the first constant.
+ C1Int.lshrInPlace(C2Int);
+
+ // Each immediate must be encodable as an 8-bit value that can be rotated.
+ unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
+ if (C1Int.getBitWidth() - Zeros > 8)
+ return SDValue();
+
+ Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
+ if (C2Int.getBitWidth() - Zeros > 8)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ SDValue X = SHL.getOperand(0);
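+ // Rebuild the binop on the unshifted value with the narrowed constant.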
+ SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
+ DAG.getConstant(C1Int, dl, MVT::i32));
+ // Shift left to compensate for the lshr of C1Int.
+ SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+
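+ // Replace all uses of N with the unfolded sequence.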
+ DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
+ return SDValue(N, 0);
+}
+
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ // Only works one way, because it needs an immediate operand.
+ if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+ return Result;
+
// First try with the default operand order.
if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
return Result;
// fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
return Result;
+
+ if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+ return Result;
}
return SDValue();
return Result;
}
- // The code below optimizes (or (and X, Y), Z).
- // The AND operand needs to have a single user to make these optimizations
- // profitable.
SDValue N0 = N->getOperand(0);
- if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
- return SDValue();
SDValue N1 = N->getOperand(1);
// (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+
+ // The code below optimizes (or (and X, Y), Z).
+ // The AND operand needs to have a single user to make these optimizations
+ // profitable.
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+ return SDValue();
+
APInt SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
- if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
- return Res;
+ if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+ if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
+ return Res;
+ }
+
+ if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+ return Result;
return SDValue();
}
// fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
+
+ if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
+ return Result;
}
return SDValue();
--- /dev/null
+; RUN: llc -mtriple armv6t2 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv6t2 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv7 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-T2
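+; Check that (binop (shl x, c2), c1 << c2) is unfolded back into
+; (shl (binop x, c1), c2) so the shl can be folded into the user and no
+; mov immediate is needed for the constant.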
+
+; CHECK-LABEL: unfold1
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
+entry:
+ %or = shl i32 %a, 1
+ %shl = or i32 %or, 510
+ %add = add nsw i32 %shl, %b
+ ret i32 %add
+}
+
+; CHECK-LABEL: unfold2
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #4080
+; CHECK: sub r0, r1, r0, lsl #2
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #4080
+; CHECK-T2: sub.w r0, r1, r0, lsl #2
+define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
+entry:
+ %or = shl i32 %a, 2
+ %shl = or i32 %or, 16320
+ %sub = sub nsw i32 %b, %shl
+ ret i32 %sub
+}
+
+; CHECK-LABEL: unfold3
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #65280
+; CHECK: and r0, r1, r0, lsl #4
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #65280
+; CHECK-T2: and.w r0, r1, r0, lsl #4
+define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
+entry:
+ %or = shl i32 %a, 4
+ %shl = or i32 %or, 1044480
+ %and = and i32 %shl, %b
+ ret i32 %and
+}
+
+; CHECK-LABEL: unfold4
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #1044480
+; CHECK: eor r0, r1, r0, lsl #5
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #1044480
+; CHECK-T2: eor.w r0, r1, r0, lsl #5
+define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
+entry:
+ %or = shl i32 %a, 5
+ %shl = or i32 %or, 33423360
+ %xor = xor i32 %shl, %b
+ ret i32 %xor
+}
+
+; CHECK-LABEL: unfold5
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #496
+; CHECK: orr r0, r1, r0, lsl #6
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #496
+; CHECK-T2: orr.w r0, r1, r0, lsl #6
+define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
+entry:
+ %add = shl i32 %a, 6
+ %shl = add i32 %add, 31744
+ %or = or i32 %shl, %b
+ ret i32 %or
+}
+
+; CHECK-LABEL: unfold6
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #7936
+; CHECK: and r0, r1, r0, lsl #8
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #7936
+; CHECK-T2: and.w r0, r1, r0, lsl #8
+define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
+entry:
+ %add = shl i32 %a, 8
+ %shl = add i32 %add, 2031616
+ %and = and i32 %shl, %b
+ ret i32 %and
+}
+
+; CHECK-LABEL: unfold7
+; CHECK-NOT: mov
+; CHECK: and r0, r0, #256
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: and r0, r0, #256
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
+entry:
+ %shl = shl i32 %a, 1
+ %and = and i32 %shl, 512
+ %add = add nsw i32 %and, %b
+ ret i32 %add
+}
+
+; CHECK-LABEL: unfold8
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #126976
+; CHECK: eor r0, r1, r0, lsl #9
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #126976
+; CHECK-T2: eor.w r0, r1, r0, lsl #9
+define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
+entry:
+ %add = shl i32 %a, 9
+ %shl = add i32 %add, 65011712
+ %xor = xor i32 %shl, %b
+ ret i32 %xor
+}
+
+; CHECK-LABEL: unfold9
+; CHECK-NOT: mov
+; CHECK: eor r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: eor r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
+entry:
+ %shl = shl i32 %a, 1
+ %xor = xor i32 %shl, 510
+ %add = add nsw i32 %xor, %b
+ ret i32 %add
+}
+
+; CHECK-LABEL: unfold10
+; CHECK-NOT: mov r2
+; CHECK: orr r2, r0, #4080
+; CHECK: cmp r1, r2, lsl #10
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: orr r2, r0, #4080
+; CHECK-T2: cmp.w r1, r2, lsl #10
+define arm_aapcscc i32 @unfold10(i32 %a, i32 %b) {
+entry:
+ %or = shl i32 %a, 10
+ %shl = or i32 %or, 4177920
+ %cmp = icmp sgt i32 %shl, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; CHECK-LABEL: unfold11
+; CHECK-NOT: mov r2
+; CHECK: add r2, r0, #7936
+; CHECK: cmp r1, r2, lsl #11
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: add.w r2, r0, #7936
+; CHECK-T2: cmp.w r1, r2, lsl #11
+define arm_aapcscc i32 @unfold11(i32 %a, i32 %b) {
+entry:
+ %add = shl i32 %a, 11
+ %shl = add i32 %add, 16252928
+ %cmp = icmp sgt i32 %shl, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+