#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
cl::desc("Disable the ARM Parallel DSP pass"));
+static cl::opt<unsigned>
+NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
+ cl::desc("Limit the number of loads analysed"));
+
namespace {
struct MulCandidate;
class Reduction;
SmallVector<Instruction*, 8> Writes;
LoadPairs.clear();
WideLoads.clear();
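+  // OrderedBasicBlock lazily numbers the instructions in the block, so the
+  // repeated intra-block dominance queries below become position comparisons
+  // rather than linear scans of the block.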
+ OrderedBasicBlock OrderedBB(BB);
// Collect loads and instructions that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
Loads.push_back(Ld);
}
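+  // Bail out when there is nothing to combine, or when the block contains
+  // more loads than we're prepared to analyse.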
+ if (Loads.empty() || Loads.size() > NumLoadLimit)
+ return false;
+
using InstSet = std::set<Instruction*>;
using DepMap = std::map<Instruction*, InstSet>;
DepMap RAWDeps;
// Record any writes that may alias a load.
const auto Size = LocationSize::unknown();
- for (auto Read : Loads) {
- for (auto Write : Writes) {
+ for (auto Write : Writes) {
+ for (auto Read : Loads) {
MemoryLocation ReadLoc =
MemoryLocation(Read->getPointerOperand(), Size);
if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
ModRefInfo::ModRef)))
continue;
- if (DT->dominates(Write, Read))
+ if (OrderedBB.dominates(Write, Read))
RAWDeps[Read].insert(Write);
}
}
// Check that there is no write between the two loads which would prevent
// them from being safely merged.
auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
- LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
- LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+ LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset;
+ LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;
if (RAWDeps.count(Dominated)) {
InstSet &WritesBefore = RAWDeps[Dominated];
for (auto Before : WritesBefore) {
// We can't move the second load backward, past a write, to merge
// with the first load.
- if (DT->dominates(Dominator, Before))
+ if (OrderedBB.dominates(Dominator, Before))
return false;
}
}
// Record base, offset load pairs.
for (auto *Base : Loads) {
for (auto *Offset : Loads) {
- if (Base == Offset)
+ if (Base == Offset || OffsetLoads.count(Offset))
continue;
if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
return !R.getMulPairs().empty();
}
-
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
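+    // GetInsertPoint (below) already returns the instruction following the
+    // dominated operand, so insert directly at the given point rather than
+    // stepping past it again.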
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
+ BasicBlock::iterator(InsertAfter));
Instruction *Call = Builder.CreateCall(SMLAD, Args);
NumSMLAD++;
return Call;
};
- Instruction *InsertAfter = R.getRoot();
+ // Return the instruction after the dominated instruction.
+ auto GetInsertPoint = [this](Value *A, Value *B) {
+ assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
+ "expected at least one instruction");
+
+ Value *V = nullptr;
+ if (!isa<Instruction>(A))
+ V = B;
+ else if (!isa<Instruction>(B))
+ V = A;
+ else
+ V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;
+
+ return &*++BasicBlock::iterator(cast<Instruction>(V));
+ };
+
Value *Acc = R.getAccumulator();
// For any muls that were discovered but not paired, accumulate their values
// as before.
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
+ IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
MulCandList &MulCands = R.getMuls();
for (auto &MulCand : MulCands) {
if (MulCand->Paired)
continue;
- Value *Mul = MulCand->Root;
+ Instruction *Mul = cast<Instruction>(MulCand->Root);
LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
if (R.getType() != Mul->getType()) {
assert(R.is64Bit() && "expected 64-bit result");
- Mul = Builder.CreateSExt(Mul, R.getType());
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
+ Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
}
if (!Acc) {
  Acc = Mul;
  continue;
}
+ // If Acc is the original incoming value to the reduction, it could be a
+ // phi. But the phi will dominate Mul, meaning that Mul will be the
+ // insertion point.
+ Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
Acc = Builder.CreateAdd(Mul, Acc);
- InsertAfter = cast<Instruction>(Acc);
}
if (Acc && Acc->getType() != R.getType()) {
  Builder.SetInsertPoint(R.getRoot());
  Acc = Builder.CreateSExt(Acc, R.getType());
}
+ // Roughly sort the mul pairs in their program order.
+ OrderedBasicBlock OrderedBB(R.getRoot()->getParent());
+ llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) {
+ const Instruction *A = PairA.first->Root;
+ const Instruction *B = PairB.first->Root;
+ return OrderedBB.dominates(A, B);
+ });
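+  // Visiting the pairs in program order lets each new SMLAD be inserted
+  // after its operands and feed the previous call in as its accumulator.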
+
IntegerType *Ty = IntegerType::get(M->getContext(), 32);
for (auto &Pair : R.getMulPairs()) {
MulCandidate *LHSMul = Pair.first;
LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);
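+    // Insert the call after whichever of its operands, including the
+    // accumulator, appears last in the block.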
+ Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
+ InsertAfter = GetInsertPoint(InsertAfter, Acc);
Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
- InsertAfter = cast<Instruction>(Acc);
}
R.UpdateRoot(cast<Instruction>(Acc));
}
ret i32 %res
}
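+; Sixteen loads is exactly the default limit, so this reduction is still
+; analysed.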
+; TODO: Four smlads should be generated here, but mul.0 and mul.3 remain as
+; scalars.
+; CHECK-LABEL: num_load_limit
+; CHECK: call i32 @llvm.arm.smlad
+; CHECK: call i32 @llvm.arm.smlad
+; CHECK: call i32 @llvm.arm.smlad
+; CHECK-NOT: call i32 @llvm.arm.smlad
+define i32 @num_load_limit(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add.0 = add i32 %mul.0, %mul.1
+
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.b.2 = getelementptr i16, i16* %b, i32 2
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %ld.b.2 = load i16, i16* %addr.b.2
+ %sext.b.2 = sext i16 %ld.b.2 to i32
+ %mul.2 = mul i32 %sext.a.0, %sext.b.0
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %addr.b.3 = getelementptr i16, i16* %b, i32 3
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %ld.b.3 = load i16, i16* %addr.b.3
+ %sext.b.3 = sext i16 %ld.b.3 to i32
+ %mul.3 = mul i32 %sext.a.1, %sext.b.3
+ %add.3 = add i32 %mul.2, %mul.3
+
+ %addr.a.4 = getelementptr i16, i16* %a, i32 4
+ %addr.b.4 = getelementptr i16, i16* %b, i32 4
+ %ld.a.4 = load i16, i16* %addr.a.4
+ %sext.a.4 = sext i16 %ld.a.4 to i32
+ %ld.b.4 = load i16, i16* %addr.b.4
+ %sext.b.4 = sext i16 %ld.b.4 to i32
+ %mul.4 = mul i32 %sext.a.4, %sext.b.4
+ %addr.a.5 = getelementptr i16, i16* %a, i32 5
+ %addr.b.5 = getelementptr i16, i16* %b, i32 5
+ %ld.a.5 = load i16, i16* %addr.a.5
+ %sext.a.5 = sext i16 %ld.a.5 to i32
+ %ld.b.5 = load i16, i16* %addr.b.5
+ %sext.b.5 = sext i16 %ld.b.5 to i32
+ %mul.5 = mul i32 %sext.a.5, %sext.b.5
+ %add.5 = add i32 %mul.4, %mul.5
+
+ %addr.a.6 = getelementptr i16, i16* %a, i32 6
+ %addr.b.6 = getelementptr i16, i16* %b, i32 6
+ %ld.a.6 = load i16, i16* %addr.a.6
+ %sext.a.6 = sext i16 %ld.a.6 to i32
+ %ld.b.6 = load i16, i16* %addr.b.6
+ %sext.b.6 = sext i16 %ld.b.6 to i32
+ %mul.6 = mul i32 %sext.a.6, %sext.b.6
+ %addr.a.7 = getelementptr i16, i16* %a, i32 7
+ %addr.b.7 = getelementptr i16, i16* %b, i32 7
+ %ld.a.7 = load i16, i16* %addr.a.7
+ %sext.a.7 = sext i16 %ld.a.7 to i32
+ %ld.b.7 = load i16, i16* %addr.b.7
+ %sext.b.7 = sext i16 %ld.b.7 to i32
+ %mul.7 = mul i32 %sext.a.7, %sext.b.7
+ %add.7 = add i32 %mul.6, %mul.7
+
+ %add.10 = add i32 %add.7, %add.5
+ %add.11 = add i32 %add.3, %add.0
+ %add.12 = add i32 %add.10, %add.11
+ %res = add i32 %add.12, %acc
+ ret i32 %res
+}
+
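+; Eighteen loads exceed the default limit of sixteen, so the pass should
+; leave this reduction alone.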
+; CHECK-LABEL: too_many_loads
+; CHECK-NOT: call i32 @llvm.arm.smlad
+define i32 @too_many_loads(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add.0 = add i32 %mul.0, %mul.1
+
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.b.2 = getelementptr i16, i16* %b, i32 2
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %ld.b.2 = load i16, i16* %addr.b.2
+ %sext.b.2 = sext i16 %ld.b.2 to i32
+ %mul.2 = mul i32 %sext.a.0, %sext.b.0
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %addr.b.3 = getelementptr i16, i16* %b, i32 3
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %ld.b.3 = load i16, i16* %addr.b.3
+ %sext.b.3 = sext i16 %ld.b.3 to i32
+ %mul.3 = mul i32 %sext.a.1, %sext.b.3
+ %add.3 = add i32 %mul.2, %mul.3
+
+ %addr.a.4 = getelementptr i16, i16* %a, i32 4
+ %addr.b.4 = getelementptr i16, i16* %b, i32 4
+ %ld.a.4 = load i16, i16* %addr.a.4
+ %sext.a.4 = sext i16 %ld.a.4 to i32
+ %ld.b.4 = load i16, i16* %addr.b.4
+ %sext.b.4 = sext i16 %ld.b.4 to i32
+ %mul.4 = mul i32 %sext.a.4, %sext.b.4
+ %addr.a.5 = getelementptr i16, i16* %a, i32 5
+ %addr.b.5 = getelementptr i16, i16* %b, i32 5
+ %ld.a.5 = load i16, i16* %addr.a.5
+ %sext.a.5 = sext i16 %ld.a.5 to i32
+ %ld.b.5 = load i16, i16* %addr.b.5
+ %sext.b.5 = sext i16 %ld.b.5 to i32
+ %mul.5 = mul i32 %sext.a.5, %sext.b.5
+ %add.5 = add i32 %mul.4, %mul.5
+
+ %addr.a.6 = getelementptr i16, i16* %a, i32 6
+ %addr.b.6 = getelementptr i16, i16* %b, i32 6
+ %ld.a.6 = load i16, i16* %addr.a.6
+ %sext.a.6 = sext i16 %ld.a.6 to i32
+ %ld.b.6 = load i16, i16* %addr.b.6
+ %sext.b.6 = sext i16 %ld.b.6 to i32
+ %mul.6 = mul i32 %sext.a.6, %sext.b.6
+ %addr.a.7 = getelementptr i16, i16* %a, i32 7
+ %addr.b.7 = getelementptr i16, i16* %b, i32 7
+ %ld.a.7 = load i16, i16* %addr.a.7
+ %sext.a.7 = sext i16 %ld.a.7 to i32
+ %ld.b.7 = load i16, i16* %addr.b.7
+ %sext.b.7 = sext i16 %ld.b.7 to i32
+ %mul.7 = mul i32 %sext.a.7, %sext.b.7
+ %add.7 = add i32 %mul.6, %mul.7
+
+  %addr.a.8 = getelementptr i16, i16* %a, i32 8
+  %addr.b.8 = getelementptr i16, i16* %b, i32 8
+ %ld.a.8 = load i16, i16* %addr.a.8
+ %sext.a.8 = sext i16 %ld.a.8 to i32
+ %ld.b.8 = load i16, i16* %addr.b.8
+ %sext.b.8 = sext i16 %ld.b.8 to i32
+ %mul.8 = mul i32 %sext.a.8, %sext.b.8
+
+ %add.10 = add i32 %add.7, %add.5
+ %add.11 = add i32 %add.3, %add.0
+ %add.12 = add i32 %add.10, %add.11
+ %add.13 = add i32 %add.12, %acc
+ %res = add i32 %add.13, %mul.8
+ ret i32 %res
+}
-; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LLC
+; RUN: opt -S -mtriple=armv7-a -arm-parallel-dsp -dce %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT
; TODO: Think we should be able to use smlsdx/smlsldx here.
; CHECK-LABEL: complex_dot_prod
-; CHECK: smulbb
-; CHECK: smultt
-; CHECK: smlalbb
-; CHECK: smultt
-; CHECK: smlalbb
-; CHECK: smultt
-; CHECK: smlalbb
-; CHECK: smultt
-; CHECK: smlaldx
-; CHECK: smlaldx
-; CHECK: smlaldx
-; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smulbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smlalbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlalbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smlalbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlaldx
+; CHECK-LLC: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+
+; CHECK-OPT: [[ADDR_A:%[^ ]+]] = bitcast i16* %pSrcA to i32*
+; CHECK-OPT: [[A:%[^ ]+]] = load i32, i32* [[ADDR_A]], align 2
+; CHECK-OPT: [[ADDR_A_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 2
+; CHECK-OPT: [[ADDR_B:%[^ ]+]] = bitcast i16* %pSrcB to i32*
+; CHECK-OPT: [[B:%[^ ]+]] = load i32, i32* [[ADDR_B]], align 2
+; CHECK-OPT: [[ACC0:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A]], i32 [[B]], i64 0)
+; CHECK-OPT: [[ADDR_B_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 2
+; CHECK-OPT: [[CAST_ADDR_A_2:%[^ ]+]] = bitcast i16* [[ADDR_A_2]] to i32*
+; CHECK-OPT: [[A_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_2]], align 2
+; CHECK-OPT: [[ADDR_A_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 4
+; CHECK-OPT: [[CAST_ADDR_B_2:%[^ ]+]] = bitcast i16* [[ADDR_B_2]] to i32*
+; CHECK-OPT: [[B_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_2]], align 2
+; CHECK-OPT: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_2]], i32 [[B_2]], i64 [[ACC0]])
+; CHECK-OPT: [[ADDR_B_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 4
+; CHECK-OPT: [[CAST_ADDR_A_4:%[^ ]+]] = bitcast i16* [[ADDR_A_4]] to i32*
+; CHECK-OPT: [[A_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_4]], align 2
+; CHECK-OPT: [[ADDR_A_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 6
+; CHECK-OPT: [[CAST_ADDR_B_4:%[^ ]+]] = bitcast i16* [[ADDR_B_4]] to i32*
+; CHECK-OPT: [[B_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_4]], align 2
+; CHECK-OPT: [[ACC2:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_4]], i32 [[B_4]], i64 [[ACC1]])
+; CHECK-OPT: [[ADDR_B_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 6
+; CHECK-OPT: [[CAST_ADDR_A_6:%[^ ]+]] = bitcast i16* [[ADDR_A_6]] to i32*
+; CHECK-OPT: [[A_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_6]], align 2
+; CHECK-OPT: [[CAST_ADDR_B_6:%[^ ]+]] = bitcast i16* [[ADDR_B_6]] to i32*
+; CHECK-OPT: [[B_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_6]], align 2
+; CHECK-OPT: call i64 @llvm.arm.smlaldx(i32 [[A_6]], i32 [[B_6]], i64 [[ACC2]])
+
define dso_local arm_aapcscc void @complex_dot_prod(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32* nocapture %realResult, i32* nocapture %imagResult) {
entry:
%incdec.ptr = getelementptr inbounds i16, i16* %pSrcA, i32 1
%conv85 = sext i32 %mul84 to i64
%sub86 = sub nsw i64 %add76, %conv85
%mul89 = mul nsw i32 %conv73, %conv82
%conv90 = sext i32 %mul89 to i64
%add81 = add nsw i64 %add67, %conv90
%add91 = add nsw i64 %add81, %conv80
%16 = lshr i64 %sub86, 6
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {
entry:
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) {
entry:
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) {
entry:
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {
entry:
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 0)
-; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 [[ACC]])
+; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0)
+; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])
define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
; CHECK: [[BIJ_LD:%[^ ]+]] = load i32, i32* [[BIJ_CAST]], align 2
; CHECK: [[CIJ_CAST:%[^ ]+]] = bitcast i16* [[CIJ]] to i32*
; CHECK: [[CIJ_LD:%[^ ]+]] = load i32, i32* [[CIJ_CAST]], align 2
+; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 0)
; CHECK: [[BIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 2
; CHECK: [[BIJ_2_CAST:%[^ ]+]] = bitcast i16* [[BIJ_2]] to i32*
; CHECK: [[BIJ_2_LD:%[^ ]+]] = load i32, i32* [[BIJ_2_CAST]], align 2
; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2
; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32*
; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2
-; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 0)
-; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 [[SMLAD0]])
+; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 [[SMLAD0]])
; CHECK: store i32 [[SMLAD1]], i32* %arrayidx, align 4
define void @full_unroll(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) {
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-LE-NEXT: ldr lr, [r3, #2]!
; CHECK-LE-NEXT: ldr r4, [r2, #2]!
-; CHECK-LE-NEXT: sxtah r1, r1, lr
; CHECK-LE-NEXT: subs r0, #1
; CHECK-LE-NEXT: smlad r12, r4, lr, r12
+; CHECK-LE-NEXT: sxtah r1, r1, lr
; CHECK-LE-NEXT: bne .LBB0_2
; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-LE-NEXT: add.w r0, r12, r1
define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
; CHECK-LE-LABEL: mul_top_user:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .save {r4, r5, r7, lr}
-; CHECK-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-LE-NEXT: .save {r4, lr}
+; CHECK-LE-NEXT: push {r4, lr}
; CHECK-LE-NEXT: cmp r0, #1
; CHECK-LE-NEXT: blt .LBB2_4
; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-LE-NEXT: sub.w lr, r2, #2
+; CHECK-LE-NEXT: subs r2, #2
; CHECK-LE-NEXT: subs r3, #2
; CHECK-LE-NEXT: mov.w r12, #0
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: .p2align 2
; CHECK-LE-NEXT: .LBB2_2: @ %for.body
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldr r2, [lr, #2]!
-; CHECK-LE-NEXT: ldr r4, [r3, #2]!
-; CHECK-LE-NEXT: asrs r5, r2, #16
-; CHECK-LE-NEXT: smlad r12, r2, r4, r12
+; CHECK-LE-NEXT: ldr lr, [r3, #2]!
+; CHECK-LE-NEXT: ldr r4, [r2, #2]!
; CHECK-LE-NEXT: subs r0, #1
-; CHECK-LE-NEXT: mul r1, r5, r1
+; CHECK-LE-NEXT: smlad r12, r4, lr, r12
+; CHECK-LE-NEXT: asr.w r4, r4, #16
+; CHECK-LE-NEXT: mul r1, r4, r1
; CHECK-LE-NEXT: bne .LBB2_2
; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-LE-NEXT: add.w r0, r12, r1
-; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT: pop {r4, pc}
; CHECK-LE-NEXT: .LBB2_4:
; CHECK-LE-NEXT: mov.w r12, #0
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: add.w r0, r12, r1
-; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT: pop {r4, pc}
;
; CHECK-BE-LABEL: mul_top_user:
; CHECK-BE: @ %bb.0: @ %entry
define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
; CHECK-LE-LABEL: and_user:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .save {r4, r5, r7, lr}
-; CHECK-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-LE-NEXT: .save {r4, lr}
+; CHECK-LE-NEXT: push {r4, lr}
; CHECK-LE-NEXT: cmp r0, #1
; CHECK-LE-NEXT: blt .LBB3_4
; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-LE-NEXT: ldr r2, [r3, #2]!
; CHECK-LE-NEXT: ldr r4, [lr, #2]!
-; CHECK-LE-NEXT: uxth r5, r2
-; CHECK-LE-NEXT: smlad r12, r4, r2, r12
; CHECK-LE-NEXT: subs r0, #1
-; CHECK-LE-NEXT: mul r1, r5, r1
+; CHECK-LE-NEXT: smlad r12, r4, r2, r12
+; CHECK-LE-NEXT: uxth r2, r2
+; CHECK-LE-NEXT: mul r1, r2, r1
; CHECK-LE-NEXT: bne .LBB3_2
; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-LE-NEXT: add.w r0, r12, r1
-; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT: pop {r4, pc}
; CHECK-LE-NEXT: .LBB3_4:
; CHECK-LE-NEXT: mov.w r12, #0
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: add.w r0, r12, r1
-; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT: pop {r4, pc}
;
; CHECK-BE-LABEL: and_user:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
-; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 [[ACC]])
; CHECK: ret i32 [[RES]]
define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {
entry:
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 %acc)
; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
-; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 %acc)
-; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 [[ACC]])
; CHECK: ret i64 [[RES]]
define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) {
entry:
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]])
+; CHECK: ret i32 [[RES]]
define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]])
+; CHECK: ret i32 [[RES]]
define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ADD0]])
; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
-; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]])
-; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]])
; CHECK: ret i32 [[RES]]
define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) {
entry:
; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[MUL0]])
; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
-; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]])
-; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]])
; CHECK: ret i32 [[RES]]
define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) {
entry:
; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ADD0]])
; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
-; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ADD0]])
-; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ACC]])
; CHECK: ret i64 [[RES]]
define i64 @with_64bit_acc(i16* nocapture readonly %in, i16* nocapture readonly %b) {
entry:
; CHECK: [[Y_1:%[^ ]+]] = load i16, i16* [[ADDR_Y_MINUS_1]], align 2
; CHECK: [[SEXT_Y_1:%[^ ]+]] = sext i16 [[Y_1]] to i32
; CHECK: [[UNPAIRED:%[^ ]+]] = mul nsw i32 [[SEXT_Y_1]], [[SEXT_X_1]]
+; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64
+; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]]
; CHECK: [[ADDR_X_PLUS_2:%[^ ]+]] = bitcast i16* [[X_PLUS_2]] to i32*
; CHECK: [[X_2:%[^ ]+]] = load i32, i32* [[ADDR_X_PLUS_2]], align 2
; CHECK: [[Y_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3
; CHECK: [[ADDR_Y_MINUS_3:%[^ ]+]] = bitcast i16* [[Y_MINUS_3]] to i32*
; CHECK: [[Y_3:%[^ ]+]] = load i32, i32* [[ADDR_Y_MINUS_3]], align 2
-; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64
-; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]]
; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[Y_3]], i32 [[X_2]], i64 [[ACC]])
; CHECK: ret i64 [[RES]]
define i64 @with_64bit_add_acc(i16* nocapture readonly %px.10756.unr, i16* nocapture readonly %py.8757.unr, i32 %acc) {
; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2
; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
+; CHECK: [[ACC:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
-; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054)
-; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]])
+; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[ACC]])
;
; And we don't want to see a 3rd smlad:
; CHECK-NOT: call i32 @llvm.arm.smlad
; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC0]])
+
; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC0]])
-; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]])
+; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC1]])
; CHECK-NOT: call i32 @llvm.arm.smlad
; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC0]])
; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
-; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]])
+; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC1]])
; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4
; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]])
; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]])
; CHECK-NOT: call i64 @llvm.arm.smlad
; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]])
; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4
; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]])
; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]])
; CHECK-NOT: call i64 @llvm.arm.smlad
; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]])
; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4
; RUN: llc -O3 -mtriple=thumbv7em -mcpu=cortex-m4 %s -o - | FileCheck %s --check-prefix=CHECK-REG-PRESSURE
; RUN: llc -O3 -mtriple=thumbv7eb %s -o - | FileCheck %s --check-prefix=CHECK-UNSUPPORTED
-; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp -arm-parallel-dsp-load-limit=20 %s -o - | FileCheck %s --check-prefix=CHECK
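+; The unrolled block contains more loads than the default limit, so raise it
+; to make sure the block is still analysed.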
; CHECK-UNSUPPORTED-LABEL: unroll_n_jam_smlad
; CHECK-UNSUPPORTED-NOT: smlad r{{.}}
; CHECK-NOT: smlad r{{.*}}
; CHECK-REG-PRESSURE: .LBB0_1:
+; CHECK-REG-PRESSURE-NOT: call i32 @llvm.arm.smlad
; CHECK-REG-PRESSURE: ldr{{.*}}, [sp
; CHECK-REG-PRESSURE: ldr{{.*}}, [sp
; CHECK-REG-PRESSURE: ldr{{.*}}, [sp
; CHECK-REG-PRESSURE: ldr{{.*}}, [sp
; CHECK-REG-PRESSURE: ldr{{.*}}, [sp
-; CHECK-REG-PRESSURE: ldr{{.*}}, [sp
-; CHECK-REG-PRESSURE: ldr{{.*}}, [sp
+; CHECK-REG-PRESSURE-NOT: ldr{{.*}}, [sp
; CHECK-REG-PRESSURE: bne .LBB0_1
for.body: