using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
using ReductionList = SmallVector<Reduction, 8>;
using ValueList = SmallVector<Value*, 8>;
- using MemInstList = SmallVector<Instruction*, 8>;
+ using MemInstList = SmallVector<LoadInst*, 8>;
using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
using PMACPairList = SmallVector<PMACPair, 8>;
using Instructions = SmallVector<Instruction*,16>;
Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
};
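+  // Bookkeeping for a group of narrow, sequential loads that have been
+  // combined into a single wide load.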
+ class WidenedLoad {
+ LoadInst *NewLd = nullptr;
+ SmallVector<LoadInst*, 4> Loads;
+
+ public:
+ WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
+ : NewLd(Wide) {
+ for (auto *I : Lds)
+ Loads.push_back(I);
+ }
+ LoadInst *getLoad() {
+ return NewLd;
+ }
+ };
+
class ARMParallelDSP : public LoopPass {
ScalarEvolution *SE;
AliasAnalysis *AA;
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
- std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
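+  // Map the first load of a widened pair to the WidenedLoad created for it.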
+ std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
- bool RecordSequentialLoads(BasicBlock *Header);
+ bool RecordSequentialLoads(BasicBlock *BB);
bool InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+ LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
+ SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy);
void CreateParallelMACPairs(Reduction &R);
- Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+ Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+ SmallVectorImpl<LoadInst*> &VecLd1,
Instruction *Acc, bool Exchange,
Instruction *InsertAfter);
}
LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
- bool Changes = false;
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
return false;
}
- Changes = MatchSMLAD(F);
+ bool Changes = MatchSMLAD(F);
return Changes;
}
};
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
template<unsigned MaxBitWidth>
static bool IsNarrowSequence(Value *V, ValueList &VL) {
- LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
ConstantInt *CInt;
if (match(V, m_ConstantInt(CInt))) {
} else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
    // TODO: we need to implement sadd16/sadd8 for this, which would also
    // enable the rewrite of smlad8.ll, but it is unsupported for now.
- LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
return false;
} else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
- if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
- LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
- cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+ if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
return false;
- }
if (match(Val, m_Load(m_Value()))) {
- LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
VL.push_back(Val);
VL.push_back(I);
return true;
}
}
- LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
return false;
}
template<typename MemInst>
static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
const DataLayout &DL, ScalarEvolution &SE) {
- if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
- LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
- return false;
- }
- if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
- LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
+ if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
return true;
- }
- LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
return false;
}
if (!Ld0 || !Ld1)
return false;
- LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
+ if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
dbgs() << "Ld0:"; Ld0->dump();
dbgs() << "Ld1:"; Ld1->dump();
);
- if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
- LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
- return false;
- }
-
- if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
- return false;
-
VecMem.clear();
VecMem.push_back(Ld0);
VecMem.push_back(Ld1);
/// Iterate through the block and record base, offset pairs of loads which can
/// be widened into a single load.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
+bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
- for (auto &I : *Header) {
+ for (auto &I : *BB) {
auto *Ld = dyn_cast<LoadInst>(&I);
- if (!Ld)
+ if (!Ld || !Ld->isSimple() ||
+ !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
continue;
Loads.push_back(Ld);
}
- std::map<LoadInst*, LoadInst*> BaseLoads;
-
for (auto *Ld0 : Loads) {
for (auto *Ld1 : Loads) {
if (Ld0 == Ld1)
if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
LoadPairs[Ld0] = Ld1;
- if (BaseLoads.count(Ld0)) {
- LoadInst *Base = BaseLoads[Ld0];
- BaseLoads[Ld1] = Base;
- SequentialLoads[Base].push_back(Ld1);
- } else {
- BaseLoads[Ld1] = Ld0;
- SequentialLoads[Ld0].push_back(Ld1);
- }
+ break;
}
}
}
+
+  LLVM_DEBUG(if (!LoadPairs.empty()) {
+    dbgs() << "Consecutive load pairs:\n";
+    for (auto &MapIt : LoadPairs)
+      dbgs() << *MapIt.first << ", " << *MapIt.second << "\n";
+  });
return LoadPairs.size() > 1;
}
if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
return false;
- LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
- << "\t Ld0: " << *Ld0 << "\n"
- << "\t Ld1: " << *Ld1 << "\n"
- << "and operands " << x + 2 << ":\n"
- << "\t Ld2: " << *Ld2 << "\n"
- << "\t Ld3: " << *Ld3 << "\n");
+ LLVM_DEBUG(dbgs() << "Loads:\n"
+ << " - " << *Ld0 << "\n"
+ << " - " << *Ld1 << "\n"
+ << " - " << *Ld2 << "\n"
+ << " - " << *Ld3 << "\n");
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
assert(PMul0 != PMul1 && "expected different chains");
- LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
- dbgs() << "- "; Mul0->dump();
- dbgs() << "- "; Mul1->dump());
-
- LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
if (CanPair(PMul0, PMul1)) {
Paired.insert(Mul0);
Paired.insert(Mul1);
dbgs() << "- "; PMul0->Root->dump();
dbgs() << "- "; PMul1->Root->dump());
- auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
- auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
- Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
+ Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
+ InsertAfter);
InsertAfter = Acc;
}
static void AddMACCandidate(OpChainList &Candidates,
Instruction *Mul,
Value *MulOp0, Value *MulOp1) {
- LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
assert(Mul->getOpcode() == Instruction::Mul &&
"expected mul instruction");
ValueList LHS;
ValueList RHS;
if (IsNarrowSequence<16>(MulOp0, LHS) &&
IsNarrowSequence<16>(MulOp1, RHS)) {
- LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
}
}
static void MatchParallelMACSequences(Reduction &R,
OpChainList &Candidates) {
Instruction *Acc = R.AccIntAdd;
- LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
+ LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc << "\n");
// Returns false to signal the search should be stopped.
std::function<bool(Value*)> Match =
return Changed;
}
-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
- Type *LoadTy) {
- const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
-
- Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
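+// Combine the two given, sequential, loads into a single wider load and
+// rewrite the sign-extending users of the original loads to use the relevant
+// half of the wide value instead.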
+LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
+ SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy) {
+ assert(Loads.size() == 2 && "currently only support widening two loads");
+
+ const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
+ Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
LoadTy->getPointerTo(AddrSpace));
- return IRB.CreateAlignedLoad(LoadTy, VecPtr, BaseLoad.getAlignment());
+ LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
+ Loads[0]->getAlignment());
+  // Fix up the users: Loads[0] needs a trunc while Loads[1] needs an lshr
+  // and a trunc.
+ Instruction *SExt0 = dyn_cast<SExtInst>(Loads[0]->user_back());
+ Instruction *SExt1 = dyn_cast<SExtInst>(Loads[1]->user_back());
+
+ assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
+         "Loads should have a single, extending user");
+
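+  // Reposition Sink, and recursively its users, after Source, but only when
+  // they share a block, neither is a PHI and Source doesn't already
+  // dominate Sink.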
+ std::function<void(Instruction*, Instruction*)> MoveAfter =
+ [&](Instruction* Source, Instruction* Sink) -> void {
+ if (DT->dominates(Source, Sink) ||
+ Source->getParent() != Sink->getParent() ||
+ isa<PHINode>(Source) || isa<PHINode>(Sink))
+ return;
+
+ Sink->moveAfter(Source);
+ for (auto &U : Sink->uses())
+ MoveAfter(Sink, cast<Instruction>(U.getUser()));
+ };
+
+ // From the wide load, create two values that equal the original two loads.
+ Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
+ SExt0->setOperand(0, Bottom);
+ if (auto *I = dyn_cast<Instruction>(Bottom)) {
+ I->moveAfter(WideLoad);
+ MoveAfter(I, SExt0);
+ }
+
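+  // The second value is the top half of the wide load: shift it down by the
+  // width of the narrow type before truncating.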
+ IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
+ Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+ Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
+ if (auto *I = dyn_cast<Instruction>(Top))
+ MoveAfter(WideLoad, I);
+
+ Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
+ SExt1->setOperand(0, Trunc);
+ if (auto *I = dyn_cast<Instruction>(Trunc))
+ MoveAfter(I, SExt1);
+
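+  // Record the combination so that any pair sharing this base load reuses
+  // the wide load instead of creating another one.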
+ WideLoads.emplace(std::make_pair(Loads[0],
+ make_unique<WidenedLoad>(Loads, WideLoad)));
+ return WideLoad;
}
-Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+ SmallVectorImpl<LoadInst*> &VecLd1,
Instruction *Acc, bool Exchange,
Instruction *InsertAfter) {
LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
- << "- " << *VecLd0 << "\n"
- << "- " << *VecLd1 << "\n"
+ << "- " << *VecLd0[0] << "\n"
+ << "- " << *VecLd0[1] << "\n"
+ << "- " << *VecLd1[0] << "\n"
+ << "- " << *VecLd1[1] << "\n"
<< "- " << *Acc << "\n"
- << "Exchange: " << Exchange << "\n");
+ << "- Exchange: " << Exchange << "\n");
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
// Replace the reduction chain with an intrinsic call
- Type *Ty = IntegerType::get(M->getContext(), 32);
- LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
- LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
- Value* Args[] = { NewLd0, NewLd1, Acc };
+ IntegerType *Ty = IntegerType::get(M->getContext(), 32);
+ LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
+ WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+ LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
+ WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+ Value* Args[] = { WideLd0, WideLd1, Acc };
Function *SMLAD = nullptr;
if (Exchange)
SMLAD = Acc->getType()->isIntegerTy(32) ?
}
const unsigned Pairs = VL0.size();
- LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
for (unsigned i = 0; i < Pairs; ++i) {
const Value *V0 = VL0[i];
const auto *Inst0 = dyn_cast<Instruction>(V0);
const auto *Inst1 = dyn_cast<Instruction>(V1);
- LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
- dbgs() << "mul1: "; V0->dump();
- dbgs() << "mul2: "; V1->dump());
-
if (!Inst0 || !Inst1)
return false;
- if (Inst0->isSameOperationAs(Inst1)) {
- LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+ if (Inst0->isSameOperationAs(Inst1))
continue;
- }
const APInt *C0, *C1;
if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
return false;
}
- LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
return true;
};
--- /dev/null
+; RUN: llc -O3 -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s | FileCheck %s
+
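+; Check that smlads are still generated when the sign-extended loaded values
+; have uses other than the multiplies.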
+; CHECK-LABEL: add_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]]
+define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+ %res = add i32 %mac1.0.lcssa, %count.final
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %count.next = add i32 %conv4, %count
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_bottom_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: mul [[COUNT:r[0-9]+]], [[SXT]], [[COUNT]]
+define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+ %res = add i32 %mac1.0.lcssa, %count.final
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %count.next = mul i32 %conv4, %count
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_top_user
+; CHECK: %for.body
+; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: asr.w [[ASR:[rl0-9]+]], [[ASR]], #16
+; CHECK: mul [[COUNT:[rl0-9]+]], [[ASR]], [[COUNT]]
+define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+ %res = add i32 %mac1.0.lcssa, %count.final
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %count.next = mul i32 %conv7, %count
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: and_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: uxth [[UXT:r[0-9]+]], [[A]]
+; CHECK: mul [[MUL:r[0-9]+]], [[UXT]], [[MUL]]
+define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+ %res = add i32 %mac1.0.lcssa, %count.final
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %bottom = and i32 %conv4, 65535
+ %mul = mul nsw i32 %conv, %conv4
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %count.next = mul i32 %bottom, %count
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: multi_uses
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]]
+; CHECK: mul [[MUL:r[0-9]+]], [[EOR]], [[SXT]]
+; CHECK: lsl.w [[SHIFT]], [[MUL]], #16
+define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+ %res = add i32 %mac1.0.lcssa, %count.final
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %bottom = and i32 %conv4, 65535
+ %mul = mul nsw i32 %conv, %conv4
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %shl = shl i32 %conv4, 16
+ %add11 = add i32 %mul9, %add10
+ %xor = xor i32 %bottom, %count
+ %count.next = mul i32 %xor, %shl
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
%exitcond = icmp ne i32 %add, %arg
br i1 %exitcond, label %for.body, label %for.cond.cleanup
}
+
--- /dev/null
+; RUN: llc -O3 -mtriple=thumbv7em %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s
+
+; Test that the duplicate loads are removed, which allows ParallelDSP to find
+; the parallel operations.
+
+define void @unroll_n_jam_smlad(i32* %res, i16* %A, i16* %B, i32 %N, i32 %idx) {
+entry:
+ %xtraiter306.i = and i32 %N, 3
+ %unroll_iter310.i = sub i32 %N, %xtraiter306.i
+ %arrayidx.us.i117.i = getelementptr inbounds i32, i32* %res, i32 %idx
+ store i32 0, i32* %arrayidx.us.i117.i, align 4
+ %mul.us.i118.i = mul i32 %idx, %N
+ %inc11.us.i.i = or i32 %idx, 1
+ %arrayidx.us.i117.1.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.i
+ store i32 0, i32* %arrayidx.us.i117.1.i, align 4
+ %mul.us.i118.1.i = mul i32 %inc11.us.i.i, %N
+ %inc11.us.i.1.i = or i32 %idx, 2
+ %arrayidx.us.i117.2.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.1.i
+ store i32 0, i32* %arrayidx.us.i117.2.i, align 4
+ %mul.us.i118.2.i = mul i32 %inc11.us.i.1.i, %N
+ %inc11.us.i.2.i = or i32 %idx, 3
+ %arrayidx.us.i117.3.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.2.i
+ store i32 0, i32* %arrayidx.us.i117.3.i, align 4
+ %mul.us.i118.3.i = mul i32 %inc11.us.i.2.i, %N
+ %inc11.us.i.3.i = add i32 %idx, 4
+ br label %for.body
+
+; CHECK: %for.body
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+
+for.body:
+ %A3 = phi i32 [ %add9.us.i.3361.i, %for.body ], [ 0, %entry ]
+ %j.026.us.i.i = phi i32 [ %inc.us.i.3362.i, %for.body ], [ 0, %entry ]
+ %A4 = phi i32 [ %add9.us.i.1.3.i, %for.body ], [ 0, %entry ]
+ %A5 = phi i32 [ %add9.us.i.2.3.i, %for.body ], [ 0, %entry ]
+ %A6 = phi i32 [ %add9.us.i.3.3.i, %for.body ], [ 0, %entry ]
+ %niter335.i = phi i32 [ %niter335.nsub.3.i, %for.body ], [ %unroll_iter310.i, %entry ]
+ %add.us.i.i = add i32 %j.026.us.i.i, %mul.us.i118.i
+ %arrayidx4.us.i.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.i
+ %A7 = load i16, i16* %arrayidx4.us.i.i, align 2
+ %conv.us.i.i = sext i16 %A7 to i32
+ %arrayidx5.us.i.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+ %A8 = load i16, i16* %arrayidx5.us.i.i, align 2
+ %conv6.us.i.i = sext i16 %A8 to i32
+ %mul7.us.i.i = mul nsw i32 %conv6.us.i.i, %conv.us.i.i
+ %add9.us.i.i = add nsw i32 %mul7.us.i.i, %A3
+ %inc.us.i.i = or i32 %j.026.us.i.i, 1
+ %add.us.i.1.i = add i32 %j.026.us.i.i, %mul.us.i118.1.i
+ %arrayidx4.us.i.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.i
+ %A9 = load i16, i16* %arrayidx4.us.i.1.i, align 2
+ %conv.us.i.1.i = sext i16 %A9 to i32
+ %arrayidx5.us.i.1.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+ %B0 = load i16, i16* %arrayidx5.us.i.1.i, align 2
+ %conv6.us.i.1.i = sext i16 %B0 to i32
+ %mul7.us.i.1.i = mul nsw i32 %conv6.us.i.1.i, %conv.us.i.1.i
+ %add9.us.i.1.i = add nsw i32 %mul7.us.i.1.i, %A4
+ %inc.us.i.1.i = or i32 %j.026.us.i.i, 1
+ %add.us.i.2.i = add i32 %j.026.us.i.i, %mul.us.i118.2.i
+ %arrayidx4.us.i.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.i
+ %B1 = load i16, i16* %arrayidx4.us.i.2.i, align 2
+ %conv.us.i.2.i = sext i16 %B1 to i32
+ %arrayidx5.us.i.2.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+ %B2 = load i16, i16* %arrayidx5.us.i.2.i, align 2
+ %conv6.us.i.2.i = sext i16 %B2 to i32
+ %mul7.us.i.2.i = mul nsw i32 %conv6.us.i.2.i, %conv.us.i.2.i
+ %add9.us.i.2.i = add nsw i32 %mul7.us.i.2.i, %A5
+ %inc.us.i.2.i = or i32 %j.026.us.i.i, 1
+ %add.us.i.3.i = add i32 %j.026.us.i.i, %mul.us.i118.3.i
+ %arrayidx4.us.i.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.i
+ %B3 = load i16, i16* %arrayidx4.us.i.3.i, align 2
+ %conv.us.i.3.i = sext i16 %B3 to i32
+ %arrayidx5.us.i.3.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+ %B4 = load i16, i16* %arrayidx5.us.i.3.i, align 2
+ %conv6.us.i.3.i = sext i16 %B4 to i32
+ %mul7.us.i.3.i = mul nsw i32 %conv6.us.i.3.i, %conv.us.i.3.i
+ %add9.us.i.3.i = add nsw i32 %mul7.us.i.3.i, %A6
+ %inc.us.i.3.i = or i32 %j.026.us.i.i, 1
+ %add.us.i.1337.i = add i32 %inc.us.i.i, %mul.us.i118.i
+ %arrayidx4.us.i.1338.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1337.i
+ %B5 = load i16, i16* %arrayidx4.us.i.1338.i, align 2
+ %conv.us.i.1339.i = sext i16 %B5 to i32
+ %arrayidx5.us.i.1340.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.i
+ %B6 = load i16, i16* %arrayidx5.us.i.1340.i, align 2
+ %conv6.us.i.1341.i = sext i16 %B6 to i32
+ %mul7.us.i.1342.i = mul nsw i32 %conv6.us.i.1341.i, %conv.us.i.1339.i
+ %add9.us.i.1343.i = add nsw i32 %mul7.us.i.1342.i, %add9.us.i.i
+ %inc.us.i.1344.i = or i32 %j.026.us.i.i, 2
+ %add.us.i.1.1.i = add i32 %inc.us.i.1.i, %mul.us.i118.1.i
+ %arrayidx4.us.i.1.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.1.i
+ %B7 = load i16, i16* %arrayidx4.us.i.1.1.i, align 2
+ %conv.us.i.1.1.i = sext i16 %B7 to i32
+ %arrayidx5.us.i.1.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.i
+ %B6.dup = load i16, i16* %arrayidx5.us.i.1.1.i, align 2
+ %conv6.us.i.1.1.i = sext i16 %B6.dup to i32
+ %mul7.us.i.1.1.i = mul nsw i32 %conv6.us.i.1.1.i, %conv.us.i.1.1.i
+ %add9.us.i.1.1.i = add nsw i32 %mul7.us.i.1.1.i, %add9.us.i.1.i
+ %inc.us.i.1.1.i = or i32 %j.026.us.i.i, 2
+ %add.us.i.2.1.i = add i32 %inc.us.i.2.i, %mul.us.i118.2.i
+ %arrayidx4.us.i.2.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.1.i
+ %B9 = load i16, i16* %arrayidx4.us.i.2.1.i, align 2
+ %conv.us.i.2.1.i = sext i16 %B9 to i32
+ %arrayidx5.us.i.2.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.i
+ %B6.dup.i = load i16, i16* %arrayidx5.us.i.2.1.i, align 2
+ %conv6.us.i.2.1.i = sext i16 %B6.dup.i to i32
+ %mul7.us.i.2.1.i = mul nsw i32 %conv6.us.i.2.1.i, %conv.us.i.2.1.i
+ %add9.us.i.2.1.i = add nsw i32 %mul7.us.i.2.1.i, %add9.us.i.2.i
+ %inc.us.i.2.1.i = or i32 %j.026.us.i.i, 2
+ %add.us.i.3.1.i = add i32 %inc.us.i.3.i, %mul.us.i118.3.i
+ %arrayidx4.us.i.3.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.1.i
+ %B11 = load i16, i16* %arrayidx4.us.i.3.1.i, align 2
+ %conv.us.i.3.1.i = sext i16 %B11 to i32
+ %arrayidx5.us.i.3.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.i
+ %B6.dup.i.i = load i16, i16* %arrayidx5.us.i.3.1.i, align 2
+ %conv6.us.i.3.1.i = sext i16 %B6.dup.i.i to i32
+ %mul7.us.i.3.1.i = mul nsw i32 %conv6.us.i.3.1.i, %conv.us.i.3.1.i
+ %add9.us.i.3.1.i = add nsw i32 %mul7.us.i.3.1.i, %add9.us.i.3.i
+ %inc.us.i.3.1.i = or i32 %j.026.us.i.i, 2
+ %add.us.i.2346.i = add i32 %inc.us.i.1344.i, %mul.us.i118.i
+ %arrayidx4.us.i.2347.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2346.i
+ %B13 = load i16, i16* %arrayidx4.us.i.2347.i, align 2
+ %conv.us.i.2348.i = sext i16 %B13 to i32
+ %arrayidx5.us.i.2349.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1344.i
+ %B14 = load i16, i16* %arrayidx5.us.i.2349.i, align 2
+ %conv6.us.i.2350.i = sext i16 %B14 to i32
+ %mul7.us.i.2351.i = mul nsw i32 %conv6.us.i.2350.i, %conv.us.i.2348.i
+ %add9.us.i.2352.i = add nsw i32 %mul7.us.i.2351.i, %add9.us.i.1343.i
+ %inc.us.i.2353.i = or i32 %j.026.us.i.i, 3
+ %add.us.i.1.2.i = add i32 %inc.us.i.1.1.i, %mul.us.i118.1.i
+ %arrayidx4.us.i.1.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.2.i
+ %B15 = load i16, i16* %arrayidx4.us.i.1.2.i, align 2
+ %conv.us.i.1.2.i = sext i16 %B15 to i32
+ %arrayidx5.us.i.1.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.1.i
+ %B14.dup = load i16, i16* %arrayidx5.us.i.1.2.i, align 2
+ %conv6.us.i.1.2.i = sext i16 %B14.dup to i32
+ %mul7.us.i.1.2.i = mul nsw i32 %conv6.us.i.1.2.i, %conv.us.i.1.2.i
+ %add9.us.i.1.2.i = add nsw i32 %mul7.us.i.1.2.i, %add9.us.i.1.1.i
+ %inc.us.i.1.2.i = or i32 %j.026.us.i.i, 3
+ %add.us.i.2.2.i = add i32 %inc.us.i.2.1.i, %mul.us.i118.2.i
+ %arrayidx4.us.i.2.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.2.i
+ %B17 = load i16, i16* %arrayidx4.us.i.2.2.i, align 2
+ %conv.us.i.2.2.i = sext i16 %B17 to i32
+ %arrayidx5.us.i.2.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.1.i
+ %B14.dup.i = load i16, i16* %arrayidx5.us.i.2.2.i, align 2
+ %conv6.us.i.2.2.i = sext i16 %B14.dup.i to i32
+ %mul7.us.i.2.2.i = mul nsw i32 %conv6.us.i.2.2.i, %conv.us.i.2.2.i
+ %add9.us.i.2.2.i = add nsw i32 %mul7.us.i.2.2.i, %add9.us.i.2.1.i
+ %inc.us.i.2.2.i = or i32 %j.026.us.i.i, 3
+ %add.us.i.3.2.i = add i32 %inc.us.i.3.1.i, %mul.us.i118.3.i
+ %arrayidx4.us.i.3.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.2.i
+ %B19 = load i16, i16* %arrayidx4.us.i.3.2.i, align 2
+ %conv.us.i.3.2.i = sext i16 %B19 to i32
+ %arrayidx5.us.i.3.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.1.i
+ %B14.dup.i.i = load i16, i16* %arrayidx5.us.i.3.2.i, align 2
+ %conv6.us.i.3.2.i = sext i16 %B14.dup.i.i to i32
+ %mul7.us.i.3.2.i = mul nsw i32 %conv6.us.i.3.2.i, %conv.us.i.3.2.i
+ %add9.us.i.3.2.i = add nsw i32 %mul7.us.i.3.2.i, %add9.us.i.3.1.i
+ %inc.us.i.3.2.i = or i32 %j.026.us.i.i, 3
+ %add.us.i.3355.i = add i32 %inc.us.i.2353.i, %mul.us.i118.i
+ %arrayidx4.us.i.3356.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3355.i
+ %B21 = load i16, i16* %arrayidx4.us.i.3356.i, align 2
+ %conv.us.i.3357.i = sext i16 %B21 to i32
+ %arrayidx5.us.i.3358.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2353.i
+ %B22 = load i16, i16* %arrayidx5.us.i.3358.i, align 2
+ %conv6.us.i.3359.i = sext i16 %B22 to i32
+ %mul7.us.i.3360.i = mul nsw i32 %conv6.us.i.3359.i, %conv.us.i.3357.i
+ %add9.us.i.3361.i = add nsw i32 %mul7.us.i.3360.i, %add9.us.i.2352.i
+ %inc.us.i.3362.i = add i32 %j.026.us.i.i, 4
+ %add.us.i.1.3.i = add i32 %inc.us.i.1.2.i, %mul.us.i118.1.i
+ %arrayidx4.us.i.1.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.3.i
+ %B23 = load i16, i16* %arrayidx4.us.i.1.3.i, align 2
+ %conv.us.i.1.3.i = sext i16 %B23 to i32
+ %arrayidx5.us.i.1.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.2.i
+ %B22.dup = load i16, i16* %arrayidx5.us.i.1.3.i, align 2
+ %conv6.us.i.1.3.i = sext i16 %B22.dup to i32
+ %mul7.us.i.1.3.i = mul nsw i32 %conv6.us.i.1.3.i, %conv.us.i.1.3.i
+ %add9.us.i.1.3.i = add nsw i32 %mul7.us.i.1.3.i, %add9.us.i.1.2.i
+ %add.us.i.2.3.i = add i32 %inc.us.i.2.2.i, %mul.us.i118.2.i
+ %arrayidx4.us.i.2.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.3.i
+ %B25 = load i16, i16* %arrayidx4.us.i.2.3.i, align 2
+ %conv.us.i.2.3.i = sext i16 %B25 to i32
+ %arrayidx5.us.i.2.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.2.i
+ %B22.dup.i = load i16, i16* %arrayidx5.us.i.2.3.i, align 2
+ %conv6.us.i.2.3.i = sext i16 %B22.dup.i to i32
+ %mul7.us.i.2.3.i = mul nsw i32 %conv6.us.i.2.3.i, %conv.us.i.2.3.i
+ %add9.us.i.2.3.i = add nsw i32 %mul7.us.i.2.3.i, %add9.us.i.2.2.i
+ %add.us.i.3.3.i = add i32 %inc.us.i.3.2.i, %mul.us.i118.3.i
+ %arrayidx4.us.i.3.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.3.i
+ %B27 = load i16, i16* %arrayidx4.us.i.3.3.i, align 2
+ %conv.us.i.3.3.i = sext i16 %B27 to i32
+ %arrayidx5.us.i.3.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.2.i
+ %B22.dup.i.i = load i16, i16* %arrayidx5.us.i.3.3.i, align 2
+ %conv6.us.i.3.3.i = sext i16 %B22.dup.i.i to i32
+ %mul7.us.i.3.3.i = mul nsw i32 %conv6.us.i.3.3.i, %conv.us.i.3.3.i
+ %add9.us.i.3.3.i = add nsw i32 %mul7.us.i.3.3.i, %add9.us.i.3.2.i
+ %niter335.nsub.3.i = add i32 %niter335.i, -4
+ %niter335.ncmp.3.i = icmp eq i32 %niter335.nsub.3.i, 0
+ br i1 %niter335.ncmp.3.i, label %exit, label %for.body
+
+exit:
+ %arrayidx.out.i = getelementptr inbounds i32, i32* %res, i32 0
+ store i32 %add9.us.i.3361.i, i32* %arrayidx.out.i, align 4
+ %arrayidx.out.1.i = getelementptr inbounds i32, i32* %res, i32 1
+ store i32 %add9.us.i.1.3.i, i32* %arrayidx.out.1.i, align 4
+ %arrayidx.out.2.i = getelementptr inbounds i32, i32* %res, i32 2
+ store i32 %add9.us.i.2.3.i, i32* %arrayidx.out.2.i, align 4
+ %arrayidx.out.3.i = getelementptr inbounds i32, i32* %res, i32 3
+ store i32 %add9.us.i.3.3.i, i32* %arrayidx.out.3.i, align 4
+ ret void
+}