-//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
+//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
Value* RHS;
bool Exchange = false;
bool ReadOnly = true;
+ bool Paired = false;
SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.
MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
    Root(I), LHS(lhs), RHS(rhs) { }
LoadInst *getBaseLoad() const {
- return cast<LoadInst>(LHS);
+ return VecLd.front();
}
};
Value *Acc = nullptr;
MulCandList Muls;
MulPairList MulPairs;
- SmallPtrSet<Instruction*, 4> Adds;
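+    // SetVector iterates in insertion order, unlike SmallPtrSet, so the adds
+    // are visited deterministically.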
+ SetVector<Instruction*> Adds;
public:
Reduction() = delete;
/// Record an Add instruction that is a part of this reduction.
void InsertAdd(Instruction *I) { Adds.insert(I); }
- /// Record a MulCandidate, rooted at a Mul instruction, that is a part of
- /// this reduction.
- void InsertMul(Instruction *I, Value *LHS, Value *RHS) {
- Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
+  /// Create MulCandidates, each rooted at a Mul instruction, that are part
+  /// of this reduction.
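+  /// Each add is expected to consume a mul either directly or through a
+  /// sign-extend, e.g. (a sketch):
+  ///   add(sext(mul(sext(a), sext(b))), ...) or add(mul(sext(a), sext(b)), ...)
+  /// and the MulCandidate records the mul along with the narrow values a and
+  /// b feeding it.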
+ void InsertMuls() {
+ auto GetMulOperand = [](Value *V) -> Instruction* {
+ if (auto *SExt = dyn_cast<SExtInst>(V)) {
+ if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
+ if (I->getOpcode() == Instruction::Mul)
+ return I;
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I->getOpcode() == Instruction::Mul)
+ return I;
+ }
+ return nullptr;
+ };
+
+ auto InsertMul = [this](Instruction *I) {
+ Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
+ Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
+ Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
+ };
+
+ for (auto *Add : Adds) {
+ if (Add == Acc)
+ continue;
+ if (auto *Mul = GetMulOperand(Add->getOperand(0)))
+ InsertMul(Mul);
+ if (auto *Mul = GetMulOperand(Add->getOperand(1)))
+ InsertMul(Mul);
+ }
}
/// Add the incoming accumulator value, returns true if a value had not
/// already been added.
/// Set two MulCandidates, rooted at muls, that can be executed as a single
/// parallel operation.
- void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1) {
+ void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
+ bool Exchange = false) {
+ LLVM_DEBUG(dbgs() << "Pairing:\n"
+ << *Mul0->Root << "\n"
+ << *Mul1->Root << "\n");
+ Mul0->Paired = true;
+ Mul1->Paired = true;
+ if (Exchange)
+ Mul1->Exchange = true;
MulPairs.push_back(std::make_pair(Mul0, Mul1));
}
Value *getAccumulator() { return Acc; }
/// Return the set of adds that comprise the reduction.
- SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; }
+ SetVector<Instruction*> &getAdds() { return Adds; }
/// Return the MulCandidates, each rooted at a mul instruction, that
/// comprise the reduction.
void UpdateRoot(Instruction *SMLAD) {
Root->replaceAllUsesWith(SMLAD);
}
+
+
+  void dump() {
+    LLVM_DEBUG({
+      dbgs() << "Reduction:\n";
+      for (auto *Add : Adds)
+        dbgs() << *Add << "\n";
+      for (auto &Mul : Muls)
+        dbgs() << *Mul->Root << "\n"
+               << "  " << *Mul->LHS << "\n"
+               << "  " << *Mul->RHS << "\n";
+      if (Acc)
+        dbgs() << "Acc in: " << *Acc << "\n";
+    });
+  }
};
class WidenedLoad {
}
};
- class ARMParallelDSP : public LoopPass {
+ class ARMParallelDSP : public FunctionPass {
ScalarEvolution *SE;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
DominatorTree *DT;
- LoopInfo *LI;
- Loop *L;
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
template<unsigned>
- bool IsNarrowSequence(Value *V, Value *&Src);
-
+ bool IsNarrowSequence(Value *V);
+ bool Search(Value *V, BasicBlock *BB, Reduction &R);
bool RecordMemoryOps(BasicBlock *BB);
void InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
/// products to a 32-bit accumulate operand. Optionally, the instruction can
/// exchange the halfwords of the second operand before performing the
/// arithmetic.
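/// For example, the exchanging form SMLADX computes, roughly:
///   Res = Acc + (lo16(Op0) * hi16(Op1)) + (hi16(Op0) * lo16(Op1))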
- bool MatchSMLAD(Loop *L);
+ bool MatchSMLAD(Function &F);
public:
static char ID;
- ARMParallelDSP() : LoopPass(ID) { }
-
- bool doInitialization(Loop *L, LPPassManager &LPM) override {
- LoadPairs.clear();
- WideLoads.clear();
- return true;
- }
+ ARMParallelDSP() : FunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- LoopPass::getAnalysisUsage(AU);
+ FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
- AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
- bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
+ bool runOnFunction(Function &F) override {
if (DisableParallelDSP)
return false;
- if (skipLoop(TheLoop))
+ if (skipFunction(F))
return false;
- L = TheLoop;
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &TPC = getAnalysis<TargetPassConfig>();
- BasicBlock *Header = TheLoop->getHeader();
- if (!Header)
- return false;
-
- // TODO: We assume the loop header and latch to be the same block.
- // This is not a fundamental restriction, but lifting this would just
- // require more work to do the transformation and then patch up the CFG.
- if (Header != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
- "running pass ARMParallelDSP\n");
- return false;
- }
-
- if (!TheLoop->getLoopPreheader())
- InsertPreheaderForLoop(L, DT, LI, nullptr, true);
-
- Function &F = *Header->getParent();
M = F.getParent();
DL = &M->getDataLayout();
return false;
}
- LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
-
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- if (!RecordMemoryOps(Header)) {
- LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
- return false;
- }
-
- bool Changes = MatchSMLAD(L);
+ bool Changes = MatchSMLAD(F);
return Changes;
}
};
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
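// A qualifying narrow sequence is, roughly, a sign-extended load that can be
// paired with a neighbouring load, e.g.:
//   %ld   = load i16, i16* %addr
//   %sext = sext i16 %ld to i32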
template<unsigned MaxBitWidth>
-bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) {
+bool ARMParallelDSP::IsNarrowSequence(Value *V) {
if (auto *SExt = dyn_cast<SExtInst>(V)) {
if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
return false;
if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
- // Check that these load could be paired.
- if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld))
- return false;
-
- Src = Ld;
- return true;
+ // Check that this load could be paired.
+ return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
}
}
return false;
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
SmallVector<Instruction*, 8> Writes;
+ LoadPairs.clear();
+ WideLoads.clear();
// Collect loads and instructions that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
return LoadPairs.size() > 1;
}
-// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
+// Search recursively back through the operands to find a tree of values that
+// form a multiply-accumulate chain. The search records the Add and Mul
+// instructions that form the reduction and allows us to find a single value
+// to be used as the initial input to the accumulator.
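+// For example, a chain of roughly this shape:
+//
+//   mul0 = mul i32 %sext0, %sext1
+//   mul1 = mul i32 %sext2, %sext3
+//   add0 = add i32 %mul0, %mul1
+//   acc1 = add i32 %add0, %acc0   ; %acc0 becomes the accumulator input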
+bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
+ // If we find a non-instruction, try to use it as the initial accumulator
+ // value. This may have already been found during the search in which case
+ // this function will return false, signaling a search fail.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return R.InsertAcc(V);
+
+ if (I->getParent() != BB)
+ return false;
+
+ switch (I->getOpcode()) {
+ default:
+ break;
+ case Instruction::PHI:
+ // Could be the accumulator value.
+ return R.InsertAcc(V);
+ case Instruction::Add: {
+ // Adds should be adding together two muls, or another add and a mul to
+ // be within the mac chain. One of the operands may also be the
+ // accumulator value at which point we should stop searching.
+ R.InsertAdd(I);
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ bool ValidLHS = Search(LHS, BB, R);
+ bool ValidRHS = Search(RHS, BB, R);
+
+ if (ValidLHS && ValidRHS)
+ return true;
+
+ return R.InsertAcc(I);
+ }
+ case Instruction::Mul: {
+ Value *MulOp0 = I->getOperand(0);
+ Value *MulOp1 = I->getOperand(1);
+ return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
+ }
+ case Instruction::SExt:
+ return Search(I->getOperand(0), BB, R);
+ }
+ return false;
+}
+
+// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find integer add then look for this pattern:
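//
//    acc0 = ...
//    ld0 = load i16
//    sext0 = sext i16 %ld0 to i32
//    ld1 = load i16
//    sext1 = sext i16 %ld1 to i32
//    mul0 = mul i32 %sext0, %sext1
//    ld2 = load i16
//    sext2 = sext i16 %ld2 to i32
//    ld3 = load i16
//    sext3 = sext i16 %ld3 to i32
//    mul1 = mul i32 %sext2, %sext3
//    add0 = add i32 %mul0, %acc0
//    acc1 = add i32 %add0, %mul1
//
// which, roughly, can then be selected to:
//
//    ldr r0
//    ldr r1
//    smlad r2, r0, r1, r2
//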
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
-bool ARMParallelDSP::MatchSMLAD(Loop *L) {
- // Search recursively back through the operands to find a tree of values that
- // form a multiply-accumulate chain. The search records the Add and Mul
- // instructions that form the reduction and allows us to find a single value
- // to be used as the initial input to the accumlator.
- std::function<bool(Value*, Reduction&)> Search = [&]
- (Value *V, Reduction &R) -> bool {
-
- // If we find a non-instruction, try to use it as the initial accumulator
- // value. This may have already been found during the search in which case
- // this function will return false, signaling a search fail.
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return R.InsertAcc(V);
-
- switch (I->getOpcode()) {
- default:
- break;
- case Instruction::PHI:
- // Could be the accumulator value.
- return R.InsertAcc(V);
- case Instruction::Add: {
- // Adds should be adding together two muls, or another add and a mul to
- // be within the mac chain. One of the operands may also be the
- // accumulator value at which point we should stop searching.
- bool ValidLHS = Search(I->getOperand(0), R);
- bool ValidRHS = Search(I->getOperand(1), R);
- if (!ValidLHS && !ValidLHS)
- return false;
- else if (ValidLHS && ValidRHS) {
- R.InsertAdd(I);
- return true;
- } else {
- R.InsertAdd(I);
- return R.InsertAcc(I);
- }
- }
- case Instruction::Mul: {
- Value *MulOp0 = I->getOperand(0);
- Value *MulOp1 = I->getOperand(1);
- if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
- Value *LHS = nullptr;
- Value *RHS = nullptr;
- if (IsNarrowSequence<16>(MulOp0, LHS) &&
- IsNarrowSequence<16>(MulOp1, RHS)) {
- R.InsertMul(I, LHS, RHS);
- return true;
- }
- }
- return false;
- }
- case Instruction::SExt:
- return Search(I->getOperand(0), R);
- }
- return false;
- };
-
+bool ARMParallelDSP::MatchSMLAD(Function &F) {
bool Changed = false;
- SmallPtrSet<Instruction*, 4> AllAdds;
- BasicBlock *Latch = L->getLoopLatch();
- for (Instruction &I : reverse(*Latch)) {
- if (I.getOpcode() != Instruction::Add)
+ for (auto &BB : F) {
+ SmallPtrSet<Instruction*, 4> AllAdds;
+ if (!RecordMemoryOps(&BB))
continue;
- if (AllAdds.count(&I))
- continue;
+ for (Instruction &I : reverse(BB)) {
+ if (I.getOpcode() != Instruction::Add)
+ continue;
- const auto *Ty = I.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
+ if (AllAdds.count(&I))
+ continue;
- Reduction R(&I);
- if (!Search(&I, R))
- continue;
+ const auto *Ty = I.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
- if (!CreateParallelPairs(R))
- continue;
+ Reduction R(&I);
+ if (!Search(&I, &BB, R))
+ continue;
+
+ R.InsertMuls();
+ LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
- InsertParallelMACs(R);
- Changed = true;
- AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+ if (!CreateParallelPairs(R))
+ continue;
+
+ InsertParallelMACs(R);
+ Changed = true;
+ AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+ }
}
return Changed;
auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
- LLVM_DEBUG(dbgs() << "Loads:\n"
- << " - " << *Ld0 << "\n"
- << " - " << *Ld1 << "\n"
- << " - " << *Ld2 << "\n"
- << " - " << *Ld3 << "\n");
-
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
} else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
- PMul1->Exchange = true;
- R.AddMulPair(PMul0, PMul1);
+ R.AddMulPair(PMul0, PMul1, true);
return true;
}
} else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
LLVM_DEBUG(dbgs() << " and swapping muls\n");
- PMul0->Exchange = true;
// Only the second operand can be exchanged, so swap the muls.
- R.AddMulPair(PMul1, PMul0);
+ R.AddMulPair(PMul1, PMul0, true);
return true;
}
return false;
MulCandList &Muls = R.getMuls();
const unsigned Elems = Muls.size();
- SmallPtrSet<const Instruction*, 4> Paired;
for (unsigned i = 0; i < Elems; ++i) {
MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
- if (Paired.count(PMul0->Root))
+ if (PMul0->Paired)
continue;
for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;
MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
- if (Paired.count(PMul1->Root))
+ if (PMul1->Paired)
continue;
const Instruction *Mul0 = PMul0->Root;
assert(PMul0 != PMul1 && "expected different chains");
- if (CanPair(R, PMul0, PMul1)) {
- Paired.insert(Mul0);
- Paired.insert(Mul1);
+ if (CanPair(R, PMul0, PMul1))
break;
- }
}
}
return !R.getMulPairs().empty();
Instruction *InsertAfter = R.getRoot();
Value *Acc = R.getAccumulator();
+
+ // For any muls that were discovered but not paired, accumulate their values
+ // as before.
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+ MulCandList &MulCands = R.getMuls();
+ for (auto &MulCand : MulCands) {
+ if (MulCand->Paired)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *MulCand->Root
+ << "\n");
+ if (!Acc) {
+ Acc = MulCand->Root;
+ continue;
+ }
+ Acc = Builder.CreateAdd(MulCand->Root, Acc);
+ InsertAfter = cast<Instruction>(Acc);
+ }
+
if (!Acc)
Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
IntegerType *Ty = IntegerType::get(M->getContext(), 32);
- LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n"
- << "Acc: " << *Acc << "\n");
for (auto &Pair : R.getMulPairs()) {
MulCandidate *LHSMul = Pair.first;
MulCandidate *RHSMul = Pair.second;
- LLVM_DEBUG(dbgs() << "Muls:\n"
- << "- " << *LHSMul->Root << "\n"
- << "- " << *RHSMul->Root << "\n");
LoadInst *BaseLHS = LHSMul->getBaseLoad();
LoadInst *BaseRHS = RHSMul->getBaseLoad();
LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
// TODO: Support big-endian as well.
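// For a little-endian widened i32 load of two i16s, the halves are recovered
// roughly as:
//   %bottom = trunc i32 %wide to i16
//   %top    = trunc i32 (lshr i32 %wide, 16) to i16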
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
- BaseSExt->setOperand(0, Bottom);
+ Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
+ BaseSExt->replaceAllUsesWith(NewBaseSExt);
IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
- OffsetSExt->setOperand(0, Trunc);
-
+ Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
+ OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+
+ LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
+ << *Base << "\n" << *Offset << "\n"
+ << "Created Wide Load:\n"
+ << *WideLoad << "\n"
+ << *Bottom << "\n"
+ << *NewBaseSExt << "\n"
+ << *Top << "\n"
+ << *Trunc << "\n"
+ << *NewOffsetSExt << "\n");
WideLoads.emplace(std::make_pair(Base,
std::make_unique<WidenedLoad>(Loads, WideLoad)));
return WideLoad;
char ARMParallelDSP::ID = 0;
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
-; CHECK-NEXT: Loop Pass Manager
-; CHECK-NEXT: Transform loops to use DSP intrinsics
+; CHECK-NEXT: Transform functions to use DSP intrinsics
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: ARM IR optimizations
; CHECK-NEXT: Dominator Tree Construction
--- /dev/null
+; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: single_block
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
+define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add = add i32 %mul.0, %mul.1
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: multi_block
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
+define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add = add i32 %mul.0, %mul.1
+ br label %bb.1
+
+bb.1:
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: multi_block_1
+; CHECK-NOT: call i32 @llvm.arm.smlad
+define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ br label %bb.1
+
+bb.1:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add = add i32 %mul.0, %mul.1
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
--- /dev/null
+; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: exchange_1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.1
+ %mul.1 = mul i32 %sext.a.1, %sext.b.0
+ %add = add i32 %mul.0, %mul.1
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_2
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.b.1, %sext.a.0
+ %mul.1 = mul i32 %sext.b.0, %sext.a.1
+ %add = add i32 %mul.0, %mul.1
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_3
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
+define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.1
+ %mul.1 = mul i32 %sext.a.1, %sext.b.0
+ %add = add i32 %mul.1, %mul.0
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_4
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
+define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.b.1, %sext.a.0
+ %mul.1 = mul i32 %sext.b.0, %sext.a.1
+ %add = add i32 %mul.1, %mul.0
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_multi_use_1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
+; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
+define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.1
+ %mul.1 = mul i32 %sext.a.1, %sext.b.0
+ %add = add i32 %mul.0, %mul.1
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %mul.2 = mul i32 %sext.a.3, %sext.b.1
+ %mul.3 = mul i32 %sext.a.2, %sext.b.0
+ %add.1 = add i32 %mul.2, %mul.3
+ %add.2 = add i32 %add, %add.1
+ %res = add i32 %add.2, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_multi_use_2
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
+define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add = add i32 %mul.0, %mul.1
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %mul.2 = mul i32 %sext.b.0, %sext.a.3
+ %mul.3 = mul i32 %sext.b.1, %sext.a.2
+ %add.1 = add i32 %mul.2, %mul.3
+ %add.2 = add i32 %add, %add.1
+ %res = add i32 %add.2, %acc
+ ret i32 %res
+}
+
+; TODO: Why aren't two intrinsics generated?
+; CHECK-LABEL: exchange_multi_use_3
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK-NOT: call i32 @llvm.arm.smlad
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0
+define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %mul.2 = mul i32 %sext.b.0, %sext.a.3
+ %mul.3 = mul i32 %sext.b.1, %sext.a.2
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add = add i32 %mul.0, %mul.1
+ %add.1 = add i32 %mul.2, %mul.3
+ %sub = sub i32 %add, %add.1
+ %res = add i32 %acc, %sub
+ ret i32 %res
+}
+
+; TODO: Why isn't smladx generated too?
+; CHECK-LABEL: exchange_multi_use_4
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0
+; CHECK-NOT: call i32 @llvm.arm.smlad
+define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %mul.2 = mul i32 %sext.b.0, %sext.a.3
+ %mul.3 = mul i32 %sext.b.1, %sext.a.2
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %add.1 = add i32 %mul.2, %mul.3
+ %add = add i32 %mul.0, %mul.1
+ %sub = sub i32 %add, %add.1
+ %res = add i32 %acc, %sub
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_swap
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
+define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.1, %sext.b.0
+ %mul.1 = mul i32 %sext.a.0, %sext.b.1
+ %add = add i32 %mul.0, %mul.1
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_swap_2
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.1, %sext.b.0
+ %mul.1 = mul i32 %sext.a.0, %sext.b.1
+ %add = add i32 %mul.1, %mul.0
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: exchange_swap_3
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
+define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.b.0, %sext.a.1
+ %mul.1 = mul i32 %sext.b.1, %sext.a.0
+ %add = add i32 %mul.1, %mul.0
+ %res = add i32 %add, %acc
+ ret i32 %res
+}
--- /dev/null
+; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: overlap_1
+; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
+; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
+; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
+; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
+; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc)
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]])
+; CHECK: ret i32 [[RES]]
+define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.b.2 = getelementptr i16, i16* %b, i32 2
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.b.2 = load i16, i16* %addr.b.2
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.b.2 = sext i16 %ld.b.2 to i32
+ %mul.2 = mul i32 %sext.a.2, %sext.b.2
+ %add = add i32 %mul.0, %mul.1
+ %add.1 = add i32 %mul.1, %mul.2
+ %add.2 = add i32 %add.1, %add
+ %res = add i32 %add.2, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: overlap_2
+; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
+; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[ACC1:%[^ ]+]] = add i32 %mul.1, %acc
+; CHECK: [[ACC2:%[^ ]+]] = add i32 %mul.2, [[ACC1]]
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC2]])
+; CHECK: ret i32 [[RES]]
+define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.b.2 = getelementptr i16, i16* %b, i32 2
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.b.2 = load i16, i16* %addr.b.2
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.b.2 = sext i16 %ld.b.2 to i32
+ %mul.2 = mul i32 %sext.b.2, %sext.a.2
+ %add = add i32 %mul.0, %mul.1
+ %add.1 = add i32 %mul.1, %mul.2
+ %add.2 = add i32 %add, %add.1
+ %res = add i32 %add.2, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: overlap_3
+; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
+; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
+; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
+; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.b.2 = getelementptr i16, i16* %b, i32 2
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.b.2 = load i16, i16* %addr.b.2
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.b.2 = sext i16 %ld.b.2 to i32
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %mul.2 = mul i32 %sext.a.2, %sext.b.1
+ %mul.3 = mul i32 %sext.a.3, %sext.b.2
+ %add = add i32 %mul.0, %mul.1
+ %add.1 = add i32 %mul.2, %mul.3
+ %add.2 = add i32 %add.1, %add
+ %res = add i32 %add.2, %acc
+ ret i32 %res
+}
+
+; CHECK-LABEL: overlap_4
+; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
+; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
+; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
+; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
+; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
+; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {
+entry:
+ %addr.a.1 = getelementptr i16, i16* %a, i32 1
+ %addr.b.1 = getelementptr i16, i16* %b, i32 1
+ %ld.a.0 = load i16, i16* %a
+ %sext.a.0 = sext i16 %ld.a.0 to i32
+ %ld.b.0 = load i16, i16* %b
+ %ld.a.1 = load i16, i16* %addr.a.1
+ %ld.b.1 = load i16, i16* %addr.b.1
+ %sext.a.1 = sext i16 %ld.a.1 to i32
+ %sext.b.1 = sext i16 %ld.b.1 to i32
+ %sext.b.0 = sext i16 %ld.b.0 to i32
+ %mul.0 = mul i32 %sext.a.0, %sext.b.0
+ %mul.1 = mul i32 %sext.a.1, %sext.b.1
+ %addr.a.2 = getelementptr i16, i16* %a, i32 2
+ %addr.b.2 = getelementptr i16, i16* %b, i32 2
+ %addr.a.3 = getelementptr i16, i16* %a, i32 3
+ %ld.a.2 = load i16, i16* %addr.a.2
+ %ld.b.2 = load i16, i16* %addr.b.2
+ %ld.a.3 = load i16, i16* %addr.a.3
+ %sext.a.2 = sext i16 %ld.a.2 to i32
+ %sext.b.2 = sext i16 %ld.b.2 to i32
+ %sext.a.3 = sext i16 %ld.a.3 to i32
+ %mul.2 = mul i32 %sext.b.2, %sext.a.2
+ %mul.3 = mul i32 %sext.b.1, %sext.a.3
+ %add = add i32 %mul.0, %mul.1
+ %add.1 = add i32 %mul.2, %mul.3
+ %add.2 = add i32 %add.1, %add
+ %res = add i32 %add.2, %acc
+ ret i32 %res
+}
; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 6
; CHECK: [[CAST_GEP16:%[^ ]+]] = bitcast i16* [[GEP16]] to i32*
; CHECK: [[LOAD_UNDEF:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_UNDEF]], i32 undef)
+; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_UNDEF]], i32 [[LOAD_A]], i32 undef)
define void @undef_no_return(i16* %a) {
entry:
%incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3
; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 %iv
; CHECK: [[CAST_GEP16:%[^ ]+]] = bitcast i16* [[GEP16]] to i32*
; CHECK: [[LOAD_B:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2
-; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_B]], i32 [[ACC]])
+; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_B]], i32 [[LOAD_A]], i32 [[ACC]])
define i32 @return(i16* %a, i8* %b, i32 %N) {
entry:
%incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3
--- /dev/null
+; RUN: opt -mtriple=thumbv7-unknown-linux-gnueabihf -arm-parallel-dsp -dce %s -S -o - | FileCheck %s
+
+; CHECK-LABEL: first_mul_invalid
+; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1
+; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2
+; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32
+; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1
+; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2
+; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32
+; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]]
+; CHECK: [[ADD0:%[^ ]+]] = add i32 [[MUL0]], %call
+; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3
+; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32*
+; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2
+; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
+; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
+; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
+; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
+; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
+; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
+; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
+; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: ret i32 [[RES]]
+define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) {
+entry:
+ %0 = load i16, i16* %in, align 2
+ %conv = sext i16 %0 to i32
+ %1 = load i16, i16* %b, align 2
+ %conv2 = sext i16 %1 to i32
+ %call = tail call i32 @bar(i32 %conv, i32 %conv2)
+ %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv4 = sext i16 %2 to i32
+ %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1
+ %3 = load i16, i16* %arrayidx5, align 2
+ %conv6 = sext i16 %3 to i32
+ %mul = mul nsw i32 %conv6, %conv4
+ %add = add i32 %mul, %call
+ %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2
+ %4 = load i16, i16* %arrayidx7, align 2
+ %conv8 = sext i16 %4 to i32
+ %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2
+ %5 = load i16, i16* %arrayidx9, align 2
+ %conv10 = sext i16 %5 to i32
+ %mul11 = mul nsw i32 %conv10, %conv8
+ %add12 = add i32 %add, %mul11
+ %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3
+ %6 = load i16, i16* %arrayidx13, align 2
+ %conv14 = sext i16 %6 to i32
+ %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3
+ %7 = load i16, i16* %arrayidx15, align 2
+ %conv16 = sext i16 %7 to i32
+ %mul17 = mul nsw i32 %conv16, %conv14
+ %add18 = add i32 %add12, %mul17
+ %arrayidx19 = getelementptr inbounds i16, i16* %in, i32 -4
+ %8 = load i16, i16* %arrayidx19, align 2
+ %conv20 = sext i16 %8 to i32
+ %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4
+ %9 = load i16, i16* %arrayidx21, align 2
+ %conv22 = sext i16 %9 to i32
+ %mul23 = mul nsw i32 %conv22, %conv20
+ %add24 = add i32 %add18, %mul23
+ %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5
+ %10 = load i16, i16* %arrayidx25, align 2
+ %conv26 = sext i16 %10 to i32
+ %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5
+ %11 = load i16, i16* %arrayidx27, align 2
+ %conv28 = sext i16 %11 to i32
+ %mul29 = mul nsw i32 %conv28, %conv26
+ %add30 = add i32 %add24, %mul29
+ ret i32 %add30
+}
+
+; CHECK-LABEL: with_no_acc_input
+; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1
+; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2
+; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32
+; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1
+; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2
+; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32
+; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]]
+; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3
+; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32*
+; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2
+; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
+; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
+; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
+; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
+; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
+; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
+; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
+; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: ret i32 [[RES]]
+define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) {
+entry:
+ %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1
+ %ld.2 = load i16, i16* %arrayidx3, align 2
+ %conv4 = sext i16 %ld.2 to i32
+ %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1
+ %ld.3 = load i16, i16* %arrayidx5, align 2
+ %conv6 = sext i16 %ld.3 to i32
+ %mul = mul nsw i32 %conv6, %conv4
+ %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2
+ %ld.4 = load i16, i16* %arrayidx7, align 2
+ %conv8 = sext i16 %ld.4 to i32
+ %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2
+ %ld.5 = load i16, i16* %arrayidx9, align 2
+ %conv10 = sext i16 %ld.5 to i32
+ %mul11 = mul nsw i32 %conv10, %conv8
+ %add12 = add i32 %mul, %mul11
+ %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3
+ %ld.6 = load i16, i16* %arrayidx13, align 2
+ %conv14 = sext i16 %ld.6 to i32
+ %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3
+ %ld.7 = load i16, i16* %arrayidx15, align 2
+ %conv16 = sext i16 %ld.7 to i32
+ %mul17 = mul nsw i32 %conv16, %conv14
+ %add18 = add i32 %add12, %mul17
+ %arrayidx19 = getelementptr inbounds i16, i16* %in, i32 -4
+ %ld.8 = load i16, i16* %arrayidx19, align 2
+ %conv20 = sext i16 %ld.8 to i32
+ %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4
+ %ld.9 = load i16, i16* %arrayidx21, align 2
+ %conv22 = sext i16 %ld.9 to i32
+ %mul23 = mul nsw i32 %conv22, %conv20
+ %add24 = add i32 %add18, %mul23
+ %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5
+ %ld.10 = load i16, i16* %arrayidx25, align 2
+ %conv26 = sext i16 %ld.10 to i32
+ %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5
+ %ld.11 = load i16, i16* %arrayidx27, align 2
+ %conv28 = sext i16 %ld.11 to i32
+ %mul29 = mul nsw i32 %conv28, %conv26
+ %add30 = add i32 %add24, %mul29
+ ret i32 %add30
+}
+
+declare dso_local i32 @bar(i32, i32) local_unnamed_addr
+
; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
-; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
-; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[V12]])
+; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054)
+; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]])
;
; And we don't want to see a 3rd smlad:
; CHECK-NOT: call i32 @llvm.arm.smlad
;
; The loop header is not the loop latch.
;
-; CHECK-NOT: call i32 @llvm.arm.smlad
+; CHECK: call i32 @llvm.arm.smlad
;
define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN1]], i32 [[IN2_2]], i64 [[ACC1]])
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])
; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4