unsigned MaxAlign = MFI->getMaxAlignment();
assert (!AFI->isThumb1OnlyFunction());
// Emit bic r6, r6, MaxAlign
+ assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
+ "immediates larger than 256 with all lower "
+ "bits set.");
unsigned bicOpc = AFI->isThumbFunction() ?
ARM::t2BICri : ARM::BICri;
AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
};
}
+/// Emit an instruction sequence that will align the address in
+/// register Reg by zeroing out the lower bits. For versions of the
+/// architecture that support Neon, this must be done in a single
+/// instruction, since skipAlignedDPRCS2Spills assumes it is done in a
+/// single instruction. That function only gets called when optimizing
+/// the spilling of D registers on a core with the Neon instruction set
+/// present.
+static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
+ const TargetInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, const unsigned Reg,
+ const unsigned Alignment,
+ const bool MustBeSingleInstruction) {
+ const ARMSubtarget &AST = MF.getTarget().getSubtarget<ARMSubtarget>();
+ const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
+ const unsigned AlignMask = Alignment - 1;
+ const unsigned NrBitsToZero = countTrailingZeros(Alignment);
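+ // For example, with Alignment == 8: AlignMask == 0x7 and NrBitsToZero == 3.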
+ assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
+ if (!AFI->isThumbFunction()) {
+ // If the BFC instruction is available, use it to zero the lower bits:
+ //   bfc Reg, #0, log2(Alignment)
+ // Otherwise use BIC, if the mask to zero the required number of bits
+ // can be encoded in the BIC immediate field:
+ //   bic Reg, Reg, Alignment-1
+ // Otherwise, emit a pair of shifts:
+ //   lsr Reg, Reg, log2(Alignment)
+ //   lsl Reg, Reg, log2(Alignment)
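+ // For example, Alignment == 512 gives AlignMask == 0x1FF, which does not
+ // fit the BIC immediate encoding, so on a pre-v6T2 target the shift pair
+ // is used. Zeroing the lower bits rounds the address down, which is safe
+ // here because the stack grows towards lower addresses.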
+ if (CanUseBFC) {
+ AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(~AlignMask));
+ } else if (AlignMask <= 255) {
+ AddDefaultCC(
+ AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(AlignMask)));
+ } else {
+ assert(!MustBeSingleInstruction &&
+ "Shouldn't call emitAligningInstructions demanding a single "
+ "instruction to be emitted for large stack alignment for a target "
+ "without BFC.");
+ AddDefaultCC(AddDefaultPred(
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))));
+ AddDefaultCC(AddDefaultPred(
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))));
+ }
+ } else {
+ // Since this is only reached for Thumb-2 targets, the BFC instruction
+ // should always be available.
+ assert(CanUseBFC);
+ AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(~AlignMask));
+ }
+}
+
void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front();
MachineBasicBlock::iterator MBBI = MBB.begin();
// realigned.
if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
unsigned MaxAlign = MFI->getMaxAlignment();
- assert (!AFI->isThumb1OnlyFunction());
+ assert(!AFI->isThumb1OnlyFunction());
if (!AFI->isThumbFunction()) {
- // Emit bic sp, sp, MaxAlign
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
- TII.get(ARM::BICri), ARM::SP)
- .addReg(ARM::SP, RegState::Kill)
- .addImm(MaxAlign-1)));
+ emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
+ false);
} else {
- // We cannot use sp as source/dest register here, thus we're emitting the
- // following sequence:
+ // We cannot use sp as the source/dest register here, so we use r4 to
+ // perform the calculations. We're emitting the following sequence:
// mov r4, sp
- // bic r4, r4, MaxAlign
+ // -- use emitAligningInstructions to produce the best sequence to
+ // -- zero out the lower bits in r4
// mov sp, r4
// FIXME: It will be better just to find spare register here.
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
- .addReg(ARM::SP, RegState::Kill));
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
- TII.get(ARM::t2BICri), ARM::R4)
- .addReg(ARM::R4, RegState::Kill)
- .addImm(MaxAlign-1)));
+ .addReg(ARM::SP, RegState::Kill));
+ emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
+ false);
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
- .addReg(ARM::R4, RegState::Kill));
+ .addReg(ARM::R4, RegState::Kill));
}
AFI->setShouldRestoreSPFromFP(true);
// The immediate is <= 64, so it doesn't need any special encoding.
unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
- .addReg(ARM::SP)
- .addImm(8 * NumAlignedDPRCS2Regs)));
+ .addReg(ARM::SP)
+ .addImm(8 * NumAlignedDPRCS2Regs)));
- // bic r4, r4, #align-1
- Opc = isThumb ? ARM::t2BICri : ARM::BICri;
unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment();
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
- .addReg(ARM::R4, RegState::Kill)
- .addImm(MaxAlign - 1)));
+ // We must set MustBeSingleInstruction to true, since
+ // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
+ // the stack alignment. Luckily, this can always be done since all ARM
+ // architecture versions that support Neon also support the BFC
+ // instruction.
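+ // For example, with MaxAlign == 16 the emitted alignment sequence is:
+ //   sub r4, sp, #8 * NumAlignedDPRCS2Regs
+ //   bfc r4, #0, #4
+ //   mov sp, r4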
+ emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true);
// mov sp, r4
// The stack pointer must be adjusted before spilling anything, otherwise
define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
entry:
; REALIGN-LABEL: test2
-; REALIGN: bic sp, sp, #63
+; REALIGN: bfc sp, #0, #6
; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; CHECK-IOS-LABEL: check_vfp_fold:
; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
; CHECK-IOS: sub.w r4, sp, #16
-; CHECK-IOS: bic r4, r4, #15
+; CHECK-IOS: bfc r4, #0, #4
; CHECK-IOS: mov sp, r4
; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
; ...
; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A: add r11, sp, #20
; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
; CHECK-A: bl bar
; CHECK-A: sub sp, r11, #20
; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A-THUMB: push.w {r0, r1, r2, r3, r4, r7, r12, lr}
; CHECK-A-THUMB: add r7, sp, #20
; CHECK-A-THUMB: mov r4, sp
-; CHECK-A-THUMB: bic r4, r4, #7
+; CHECK-A-THUMB: bfc r4, #0, #3
; CHECK-A-THUMB: bl bar
; CHECK-A-THUMB: sub.w r4, r7, #20
; CHECK-A-THUMB: mov sp, r4
; CHECK-M: push.w {r4, r10, r11, lr}
; CHECK-M: add.w r11, sp, #8
; CHECK-M: mov r4, sp
-; CHECK-M: bic r4, r4, #7
+; CHECK-M: bfc r4, #0, #3
; CHECK-M: mov sp, r4
; CHECK-M: bl _bar
; CHECK-M: sub.w r4, r11, #8
; 32 to get past r0, r1, ..., r7
; CHECK-A: add r11, sp, #32
; CHECK-A: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
; [...]
; 32 must match above
; CHECK-A: sub sp, r11, #32
; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
; CHECK-A: add r11, sp, #44
; CHECK-A: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
; [...]
; CHECK-A: sub sp, r11, #44
; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A: add r11, sp, #20
; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
; [...]
; CHECK-A: sub sp, r11, #20
; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A: add r11, sp, #20
; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
; [...]
; CHECK-A: sub sp, r11, #20
; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
define void @aaa(%quuz* %this, i8* %block) {
; CHECK-LABEL: aaa:
-; CHECK: bic {{.*}}, #15
+; CHECK: bfc {{.*}}, #0, #4
; CHECK: vst1.64 {{.*}}sp:128
; CHECK: vld1.64 {{.*}}sp:128
entry:
--- /dev/null
+; RUN: llc -verify-machineinstrs < %s -mtriple=armv4t | FileCheck %s -check-prefix=CHECK-v4A32
+; RUN: llc -verify-machineinstrs < %s -mtriple=armv7a | FileCheck %s -check-prefix=CHECK-v7A32
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7a | FileCheck %s -check-prefix=CHECK-THUMB2
+; FIXME: There are no tests for Thumb1 since dynamic stack alignment is not supported for
+; Thumb1.
+
+define i32 @f_bic_can_be_used_align() nounwind {
+entry:
+; CHECK-LABEL: f_bic_can_be_used_align:
+; CHECK-v7A32: bfc sp, #0, #8
+; CHECK-v4A32: bic sp, sp, #255
+; CHECK-THUMB2: mov r4, sp
+; CHECK-THUMB2-NEXT: bfc r4, #0, #8
+; CHECK-THUMB2-NEXT: mov sp, r4
+ %x = alloca i32, align 256
+ store volatile i32 0, i32* %x, align 256
+ ret i32 0
+}
+
+define i32 @f_too_large_for_bic_align() nounwind {
+entry:
+; CHECK-LABEL: f_too_large_for_bic_align:
+; CHECK-v7A32: bfc sp, #0, #9
+; CHECK-v4A32: lsr sp, sp, #9
+; CHECK-v4A32: lsl sp, sp, #9
+; CHECK-THUMB2: mov r4, sp
+; CHECK-THUMB2-NEXT: bfc r4, #0, #9
+; CHECK-THUMB2-NEXT: mov sp, r4
+ %x = alloca i32, align 512
+ store volatile i32 0, i32* %x, align 512
+ ret i32 0
+}
+
+define i8* @f_alignedDPRCS2Spills(double* %d) #0 {
+entry:
+; CHECK-LABEL: f_alignedDPRCS2Spills:
+; CHECK-v7A32: bfc sp, #0, #12
+; CHECK-v4A32: lsr sp, sp, #12
+; CHECK-v4A32: lsl sp, sp, #12
+; CHECK-THUMB2: bfc r4, #0, #12
+; CHECK-THUMB2-NEXT: mov sp, r4
+ %a = alloca i8, align 4096
+ %0 = load double* %d, align 4
+ %arrayidx1 = getelementptr inbounds double* %d, i32 1
+ %1 = load double* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds double* %d, i32 2
+ %2 = load double* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds double* %d, i32 3
+ %3 = load double* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds double* %d, i32 4
+ %4 = load double* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds double* %d, i32 5
+ %5 = load double* %arrayidx5, align 4
+ %arrayidx6 = getelementptr inbounds double* %d, i32 6
+ %6 = load double* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds double* %d, i32 7
+ %7 = load double* %arrayidx7, align 4
+ %arrayidx8 = getelementptr inbounds double* %d, i32 8
+ %8 = load double* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds double* %d, i32 9
+ %9 = load double* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds double* %d, i32 10
+ %10 = load double* %arrayidx10, align 4
+ %arrayidx11 = getelementptr inbounds double* %d, i32 11
+ %11 = load double* %arrayidx11, align 4
+ %arrayidx12 = getelementptr inbounds double* %d, i32 12
+ %12 = load double* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds double* %d, i32 13
+ %13 = load double* %arrayidx13, align 4
+ %arrayidx14 = getelementptr inbounds double* %d, i32 14
+ %14 = load double* %arrayidx14, align 4
+ %arrayidx15 = getelementptr inbounds double* %d, i32 15
+ %15 = load double* %arrayidx15, align 4
+ %arrayidx16 = getelementptr inbounds double* %d, i32 16
+ %16 = load double* %arrayidx16, align 4
+ %arrayidx17 = getelementptr inbounds double* %d, i32 17
+ %17 = load double* %arrayidx17, align 4
+ %arrayidx18 = getelementptr inbounds double* %d, i32 18
+ %18 = load double* %arrayidx18, align 4
+ %arrayidx19 = getelementptr inbounds double* %d, i32 19
+ %19 = load double* %arrayidx19, align 4
+ %arrayidx20 = getelementptr inbounds double* %d, i32 20
+ %20 = load double* %arrayidx20, align 4
+ %arrayidx21 = getelementptr inbounds double* %d, i32 21
+ %21 = load double* %arrayidx21, align 4
+ %arrayidx22 = getelementptr inbounds double* %d, i32 22
+ %22 = load double* %arrayidx22, align 4
+ %arrayidx23 = getelementptr inbounds double* %d, i32 23
+ %23 = load double* %arrayidx23, align 4
+ %arrayidx24 = getelementptr inbounds double* %d, i32 24
+ %24 = load double* %arrayidx24, align 4
+ %arrayidx25 = getelementptr inbounds double* %d, i32 25
+ %25 = load double* %arrayidx25, align 4
+ %arrayidx26 = getelementptr inbounds double* %d, i32 26
+ %26 = load double* %arrayidx26, align 4
+ %arrayidx27 = getelementptr inbounds double* %d, i32 27
+ %27 = load double* %arrayidx27, align 4
+ %arrayidx28 = getelementptr inbounds double* %d, i32 28
+ %28 = load double* %arrayidx28, align 4
+ %arrayidx29 = getelementptr inbounds double* %d, i32 29
+ %29 = load double* %arrayidx29, align 4
+ %div = fdiv double %29, %28
+ %div30 = fdiv double %div, %27
+ %div31 = fdiv double %div30, %26
+ %div32 = fdiv double %div31, %25
+ %div33 = fdiv double %div32, %24
+ %div34 = fdiv double %div33, %23
+ %div35 = fdiv double %div34, %22
+ %div36 = fdiv double %div35, %21
+ %div37 = fdiv double %div36, %20
+ %div38 = fdiv double %div37, %19
+ %div39 = fdiv double %div38, %18
+ %div40 = fdiv double %div39, %17
+ %div41 = fdiv double %div40, %16
+ %div42 = fdiv double %div41, %15
+ %div43 = fdiv double %div42, %14
+ %div44 = fdiv double %div43, %13
+ %div45 = fdiv double %div44, %12
+ %div46 = fdiv double %div45, %11
+ %div47 = fdiv double %div46, %10
+ %div48 = fdiv double %div47, %9
+ %div49 = fdiv double %div48, %8
+ %div50 = fdiv double %div49, %7
+ %div51 = fdiv double %div50, %6
+ %div52 = fdiv double %div51, %5
+ %div53 = fdiv double %div52, %4
+ %div54 = fdiv double %div53, %3
+ %div55 = fdiv double %div54, %2
+ %div56 = fdiv double %div55, %1
+ %div57 = fdiv double %div56, %0
+ %div58 = fdiv double %0, %1
+ %div59 = fdiv double %div58, %2
+ %div60 = fdiv double %div59, %3
+ %div61 = fdiv double %div60, %4
+ %div62 = fdiv double %div61, %5
+ %div63 = fdiv double %div62, %6
+ %div64 = fdiv double %div63, %7
+ %div65 = fdiv double %div64, %8
+ %div66 = fdiv double %div65, %9
+ %div67 = fdiv double %div66, %10
+ %div68 = fdiv double %div67, %11
+ %div69 = fdiv double %div68, %12
+ %div70 = fdiv double %div69, %13
+ %div71 = fdiv double %div70, %14
+ %div72 = fdiv double %div71, %15
+ %div73 = fdiv double %div72, %16
+ %div74 = fdiv double %div73, %17
+ %div75 = fdiv double %div74, %18
+ %div76 = fdiv double %div75, %19
+ %div77 = fdiv double %div76, %20
+ %div78 = fdiv double %div77, %21
+ %div79 = fdiv double %div78, %22
+ %div80 = fdiv double %div79, %23
+ %div81 = fdiv double %div80, %24
+ %div82 = fdiv double %div81, %25
+ %div83 = fdiv double %div82, %26
+ %div84 = fdiv double %div83, %27
+ %div85 = fdiv double %div84, %28
+ %div86 = fdiv double %div85, %29
+ %mul = fmul double %div57, %div86
+ %conv = fptosi double %mul to i32
+ %add.ptr = getelementptr inbounds i8* %a, i32 %conv
+ ret i8* %add.ptr
+}
;
; The caller-saved r4 is used as a scratch register for stack realignment.
; CHECK: push {r4, r7, lr}
-; CHECK: bic r4, r4, #7
+; CHECK: bfc r4, #0, #3
; CHECK: mov sp, r4
define void @f(double* nocapture %p) nounwind ssp {
entry:
; NEON: f
; NEON: push {r4, r7, lr}
; NEON: sub.w r4, sp, #64
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
; Stack pointer must be updated before the spills.
; NEON: mov sp, r4
; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]!
; NEON: f7
; NEON: push {r4, r7, lr}
; NEON: sub.w r4, sp, #56
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
; Stack pointer must be updated before the spills.
; NEON: mov sp, r4
; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]!
; NEON: push {r4, r7, lr}
; NEON: vpush {d12, d13, d14, d15}
; NEON: sub.w r4, sp, #24
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
; Stack pointer must be updated before the spills.
; NEON: mov sp, r4
; NEON: vst1.64 {d8, d9}, [r4:128]
define void @aaa(%quuz* %this, i8* %block) {
; CHECK-LABEL: aaa:
-; CHECK: bic r4, r4, #15
+; CHECK: bfc r4, #0, #4
; CHECK: vst1.64 {{.*}}[{{.*}}:128]
; CHECK: vld1.64 {{.*}}[{{.*}}:128]
entry: