Instruction *Root;
ValueList AllValues;
MemInstList VecLd; // List of all load instructions.
- MemLocList MemLocs; // All memory locations read by this tree.
+ MemInstList Loads;
bool ReadOnly = true;
OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
virtual ~OpChain() = default;
- void SetMemoryLocations() {
- const auto Size = LocationSize::unknown();
+ void PopulateLoads() {
for (auto *V : AllValues) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (I->mayWriteToMemory())
- ReadOnly = false;
- if (auto *Ld = dyn_cast<LoadInst>(V))
- MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
- }
+ if (auto *Ld = dyn_cast<LoadInst>(V))
+ Loads.push_back(Ld);
}
}
std::map<LoadInst*, LoadInst*> LoadPairs;
std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
- bool RecordSequentialLoads(BasicBlock *BB);
+ bool RecordMemoryOps(BasicBlock *BB);
bool InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
- LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
- SmallVectorImpl<LoadInst*> &Loads,
- IntegerType *LoadTy);
+ LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy);
void CreateParallelMACPairs(Reduction &R);
Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
SmallVectorImpl<LoadInst*> &VecLd1,
ARMParallelDSP() : LoopPass(ID) { }
+ bool doInitialization(Loop *L, LPPassManager &LPM) override {
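+    // The pass object persists across loops, so clear the per-loop caches of
+    // paired and widened loads before visiting each loop.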
+ LoadPairs.clear();
+ WideLoads.clear();
+ return true;
+ }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
LoopPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
if (!ST->isLittle()) {
LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
- "ARMParallelDSP\n");
+ << "ARMParallelDSP\n");
return false;
}
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- if (!RecordSequentialLoads(Header)) {
+ if (!RecordMemoryOps(Header)) {
LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
return false;
}
return true;
}
-/// Iterate through the block and record base, offset pairs of loads as well as
-/// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
+/// Iterate through the block and record base, offset pairs of loads which can
+/// be widened into a single load.
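+/// For example (illustrative names), two sign-extended i16 loads from
+/// consecutive addresses:
+///   %lo = load i16, i16* %base
+///   %hi = load i16, i16* %base.next   ; %base + 2 bytes
+/// form a pair that can later be replaced by a single i32 load.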
+bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
+ SmallVector<Instruction*, 8> Writes;
+
+  // Collect loads and instructions that may write to memory. For now we only
+ // record loads which are simple, sign-extended and have a single user.
+ // TODO: Allow zero-extended loads.
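+  // For instance, a load qualifies only if it is of the form:
+  //   %ld  = load i16, i16* %ptr          ; simple: not volatile or atomic
+  //   %sxt = sext i16 %ld to i32          ; the load's sole user
+  // whereas volatile, zero-extended or multi-use loads are skipped.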
for (auto &I : *BB) {
+ if (I.mayWriteToMemory())
+ Writes.push_back(&I);
auto *Ld = dyn_cast<LoadInst>(&I);
    if (!Ld || !Ld->isSimple() ||
        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
      continue;
    Loads.push_back(Ld);
}
- for (auto *Ld0 : Loads) {
- for (auto *Ld1 : Loads) {
- if (Ld0 == Ld1)
+ using InstSet = std::set<Instruction*>;
+ using DepMap = std::map<Instruction*, InstSet>;
+ DepMap RAWDeps;
+
+ // Record any writes that may alias a load.
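+  // RAWDeps maps each load to the set of aliasing stores that dominate it,
+  // i.e. the read-after-write dependencies that pairing must respect.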
+ const auto Size = LocationSize::unknown();
+ for (auto Read : Loads) {
+ for (auto Write : Writes) {
+ MemoryLocation ReadLoc =
+ MemoryLocation(Read->getPointerOperand(), Size);
+
+ if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
+ ModRefInfo::ModRef)))
continue;
+ if (DT->dominates(Write, Read))
+ RAWDeps[Read].insert(Write);
+ }
+ }
- if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
- LoadPairs[Ld0] = Ld1;
+  // Check that there isn't a write between the two loads which would prevent
+  // them from being safely merged.
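+  // For example, given paired loads L0 and L1, where L0 dominates L1, and a
+  // store that may alias them:
+  //   L0 .. L1 .. store   -> safe: neither load is moved past the store.
+  //   store .. L0 .. L1   -> safe: the store already dominates both loads.
+  //   L0 .. store .. L1   -> unsafe: L1 would be hoisted above the store.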
+ auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
+ LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
+ LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+
+ if (RAWDeps.count(Dominated)) {
+ InstSet &WritesBefore = RAWDeps[Dominated];
+
+ for (auto Before : WritesBefore) {
+
+ // We can't move the second load backward, past a write, to merge
+ // with the first load.
+ if (DT->dominates(Dominator, Before))
+ return false;
+ }
+ }
+ return true;
+ };
+
+ // Record base, offset load pairs.
+ for (auto *Base : Loads) {
+ for (auto *Offset : Loads) {
+ if (Base == Offset)
+ continue;
+
+ if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
+ SafeToPair(Base, Offset)) {
+ LoadPairs[Base] = Offset;
break;
}
}
for (auto &Pair : Reduction.PMACPairs) {
BinOpChain *PMul0 = Pair.first;
BinOpChain *PMul1 = Pair.second;
- LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
- dbgs() << "- "; PMul0->Root->dump();
- dbgs() << "- "; PMul1->Root->dump());
+ LLVM_DEBUG(dbgs() << "Found parallel MACs:\n"
+ << "- " << *PMul0->Root << "\n"
+ << "- " << *PMul1->Root << "\n");
Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
InsertAfter);
return false;
}
-static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
- ReductionList &Reductions) {
- RecurrenceDescriptor RecDesc;
- const bool HasFnNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
- const BasicBlock *Latch = TheLoop->getLoopLatch();
-
- for (PHINode &Phi : Header->phis()) {
- const auto *Ty = Phi.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
-
- const bool IsReduction =
- RecurrenceDescriptor::AddReductionVar(&Phi,
- RecurrenceDescriptor::RK_IntegerAdd,
- TheLoop, HasFnNoNaNAttr, RecDesc);
- if (!IsReduction)
- continue;
-
- Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
- if (!Acc)
- continue;
-
- Reductions.push_back(Reduction(&Phi, Acc));
- }
-
- LLVM_DEBUG(
- dbgs() << "\nAccumulating integer additions (reductions) found:\n";
- for (auto &R : Reductions) {
- dbgs() << "- "; R.Phi->dump();
- dbgs() << "-> "; R.AccIntAdd->dump();
- }
- );
-}
-
-static void AddMACCandidate(OpChainList &Candidates,
- Instruction *Mul,
- Value *MulOp0, Value *MulOp1) {
- assert(Mul->getOpcode() == Instruction::Mul &&
- "expected mul instruction");
- ValueList LHS;
- ValueList RHS;
- if (IsNarrowSequence<16>(MulOp0, LHS) &&
- IsNarrowSequence<16>(MulOp1, RHS)) {
- Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
- }
-}
-
static void MatchParallelMACSequences(Reduction &R,
OpChainList &Candidates) {
Instruction *Acc = R.AccIntAdd;
case Instruction::Mul: {
Value *MulOp0 = I->getOperand(0);
Value *MulOp1 = I->getOperand(1);
- if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
- AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+ if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
+ ValueList LHS;
+ ValueList RHS;
+ if (IsNarrowSequence<16>(MulOp0, LHS) &&
+ IsNarrowSequence<16>(MulOp1, RHS)) {
+ Candidates.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+ }
+ }
return false;
}
case Instruction::SExt:
<< Candidates.size() << " candidates.\n");
}
-// Collects all instructions that are not part of the MAC chains, which is the
-// set of instructions that can potentially alias with the MAC operands.
-static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
- Instructions &Writes) {
- for (auto &I : *Header) {
- if (I.mayReadFromMemory())
- Reads.push_back(&I);
- if (I.mayWriteToMemory())
- Writes.push_back(&I);
- }
-}
-
-// Check whether statements in the basic block that write to memory alias with
-// the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
-static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
- Instructions &Writes, OpChainList &MACCandidates) {
- LLVM_DEBUG(dbgs() << "Alias checks:\n");
- for (auto &MAC : MACCandidates) {
- LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-
- // At the moment, we allow only simple chains that only consist of reads,
- // accumulate their result with an integer add, and thus that don't write
- // memory, and simply bail if they do.
- if (!MAC->ReadOnly)
- return true;
-
- // Now for all writes in the basic block, check that they don't alias with
- // the memory locations accessed by our MAC-chain:
- for (auto *I : Writes) {
- LLVM_DEBUG(dbgs() << "- "; I->dump());
- assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
- for (auto &MemLoc : MAC->MemLocs) {
- if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
- ModRefInfo::ModRef))) {
- LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
- return true;
- }
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
- return false;
-}
-
static bool CheckMACMemory(OpChainList &Candidates) {
for (auto &C : Candidates) {
    // A mul has 2 operands, and a narrow op consists of a sext and a load; thus
LLVM_DEBUG(dbgs() << "Operand list too short.\n");
return false;
}
- C->SetMemoryLocations();
+ C->PopulateLoads();
ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
// before the loop begins.
//
bool ARMParallelDSP::MatchSMLAD(Function &F) {
- BasicBlock *Header = L->getHeader();
- LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
- dbgs() << "Header block:\n"; Header->dump();
- dbgs() << "Loop info:\n\n"; L->dump());
- bool Changed = false;
+ auto FindReductions = [&](ReductionList &Reductions) {
+ RecurrenceDescriptor RecDesc;
+ const bool HasFnNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+ BasicBlock *Latch = L->getLoopLatch();
+
+ for (PHINode &Phi : Latch->phis()) {
+ const auto *Ty = Phi.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
+
+ const bool IsReduction = RecurrenceDescriptor::AddReductionVar(
+ &Phi, RecurrenceDescriptor::RK_IntegerAdd, L, HasFnNoNaNAttr, RecDesc);
+
+ if (!IsReduction)
+ continue;
+
+ Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+ if (!Acc)
+ continue;
+
+ Reductions.push_back(Reduction(&Phi, Acc));
+ }
+ return !Reductions.empty();
+ };
+
ReductionList Reductions;
- MatchReductions(F, L, Header, Reductions);
+ if (!FindReductions(Reductions))
+ return false;
for (auto &R : Reductions) {
OpChainList MACCandidates;
dbgs() << "\n";);
}
- // Collect all instructions that may read or write memory. Our alias
- // analysis checks bail out if any of these instructions aliases with an
- // instruction from the MAC-chain.
- Instructions Reads, Writes;
- AliasCandidates(Header, Reads, Writes);
-
+ bool Changed = false;
+  // Alias checks between the loads and any stores are performed when the load
+  // pairs are recorded, so the candidate MACs can now be paired and replaced.
for (auto &R : Reductions) {
- if (AreAliased(AA, Reads, Writes, R.MACCandidates))
- return false;
CreateParallelMACPairs(R);
Changed |= InsertParallelMACs(R);
}
- LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
return Changed;
}
-LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
- SmallVectorImpl<LoadInst*> &Loads,
- IntegerType *LoadTy) {
+LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy) {
assert(Loads.size() == 2 && "currently only support widening two loads");
-
- const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
- Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
+
+ LoadInst *Base = Loads[0];
+ LoadInst *Offset = Loads[1];
+
+ Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
+ Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());
+
+ assert((BaseSExt && OffsetSExt)
+ && "Loads should have a single, extending, user");
+
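+  // Helper that moves A before B within the same block and then, recursively,
+  // re-positions A ahead of any of its users so that it still dominates them.
+  // PHIs and values that already dominate the sink are left untouched.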
+ std::function<void(Value*, Value*)> MoveBefore =
+ [&](Value *A, Value *B) -> void {
+ if (!isa<Instruction>(A) || !isa<Instruction>(B))
+ return;
+
+ auto *Source = cast<Instruction>(A);
+ auto *Sink = cast<Instruction>(B);
+
+ if (DT->dominates(Source, Sink) ||
+ Source->getParent() != Sink->getParent() ||
+ isa<PHINode>(Source) || isa<PHINode>(Sink))
+ return;
+
+ Source->moveBefore(Sink);
+ for (auto &U : Source->uses())
+ MoveBefore(Source, U.getUser());
+ };
+
+ // Insert the load at the point of the original dominating load.
+ LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
+ IRBuilder<NoFolder> IRB(DomLoad->getParent(),
+ ++BasicBlock::iterator(DomLoad));
+
+ // Bitcast the pointer to a wider type and create the wide load, while making
+ // sure to maintain the original alignment as this prevents ldrd from being
+ // generated when it could be illegal due to memory alignment.
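+  // For instance, two paired i16 loads with "align 2" become a single
+  // "load i32 ... align 2" rather than "align 4", so the backend cannot
+  // assume word alignment for the combined access.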
+ const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
+ Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
LoadTy->getPointerTo(AddrSpace));
LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
- Loads[0]->getAlignment());
- // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
- Instruction *SExt0 = dyn_cast<SExtInst>(Loads[0]->user_back());
- Instruction *SExt1 = dyn_cast<SExtInst>(Loads[1]->user_back());
-
- assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
- "Loads should have a single, extending, user");
-
- std::function<void(Instruction*, Instruction*)> MoveAfter =
- [&](Instruction* Source, Instruction* Sink) -> void {
- if (DT->dominates(Source, Sink) ||
- Source->getParent() != Sink->getParent() ||
- isa<PHINode>(Source) || isa<PHINode>(Sink))
- return;
-
- Sink->moveAfter(Source);
- for (auto &U : Sink->uses())
- MoveAfter(Sink, cast<Instruction>(U.getUser()));
- };
+ Base->getAlignment());
+
+ // Make sure everything is in the correct order in the basic block.
+ MoveBefore(Base->getPointerOperand(), VecPtr);
+ MoveBefore(VecPtr, WideLoad);
// From the wide load, create two values that equal the original two loads.
- Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
- SExt0->setOperand(0, Bottom);
- if (auto *I = dyn_cast<Instruction>(Bottom)) {
- I->moveAfter(WideLoad);
- MoveAfter(I, SExt0);
- }
+ // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+ // TODO: Support big-endian as well.
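+  // On little-endian, if the wide load produces 0xHHHHLLLL then the trunc
+  // yields 0xLLLL (the value of Loads[0]) and the lshr-16 plus trunc yields
+  // 0xHHHH (the value of Loads[1]).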
+ Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
+ BaseSExt->setOperand(0, Bottom);
- IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
- Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+ IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
+ Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
- if (auto *I = dyn_cast<Instruction>(Top))
- MoveAfter(WideLoad, I);
+ Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
+ OffsetSExt->setOperand(0, Trunc);
- Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
- SExt1->setOperand(0, Trunc);
- if (auto *I = dyn_cast<Instruction>(Trunc))
- MoveAfter(I, SExt1);
-
- WideLoads.emplace(std::make_pair(Loads[0],
+ WideLoads.emplace(std::make_pair(Base,
make_unique<WidenedLoad>(Loads, WideLoad)));
return WideLoad;
}
<< "- " << *Acc << "\n"
<< "- Exchange: " << Exchange << "\n");
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
-
// Replace the reduction chain with an intrinsic call
IntegerType *Ty = IntegerType::get(M->getContext(), 32);
LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
- WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+ WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
- WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+ WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
+
Value* Args[] = { WideLd0, WideLd1, Acc };
Function *SMLAD = nullptr;
if (Exchange)
SMLAD = Acc->getType()->isIntegerTy(32) ?
Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
+
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
CallInst *Call = Builder.CreateCall(SMLAD, Args);
NumSMLAD++;
return Call;
--- /dev/null
+; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -verify -S | FileCheck %s
+;
+; Alias check: check that the rewrite isn't triggered when there's a store
+; instruction possibly aliasing any mul load operands; arguments are passed
+; without 'restrict' enabled.
+;
+; CHECK-NOT: call i32 @llvm.arm.smlad
+;
+define dso_local i32 @no_restrict(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+
+; Store inserted here, aliasing with arrayidx, arrayidx1, arrayidx3
+ store i16 42, i16* %arrayidx, align 2
+
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; Alias check: check that the rewrite isn't triggered when there's a store
+; aliasing one of the mul load operands. Arguments are now annotated with
+; 'noalias'.
+;
+; CHECK-NOT: call i32 @llvm.arm.smlad
+;
+define dso_local i32 @restrict(i32 %arg, i32* noalias %arg1, i16* noalias readonly %arg2, i16* noalias readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+
+; Store inserted here, aliasing only with loads from 'arrayidx'.
+ store i16 42, i16* %arrayidx, align 2
+
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+
+; Here the Mul is the LHS, and the Add the RHS.
+ %add11 = add i32 %mul9, %add10
+
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_dominates_all
+; CHECK: store
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: smlad
+define dso_local i32 @store_dominates_all(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ store i16 42, i16* %arrayidx, align 2
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: loads_dominate
+; CHECK-NOT: store
+; CHECK: load i32
+; CHECK-NOT: store
+; CHECK: load i32
+; CHECK: store
+define dso_local i32 @loads_dominate(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ store i16 42, i16* %arrayidx, align 2
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_legal_1
+; CHECK-NOT: store
+; CHECK: phi i32
+; CHECK: [[IV:%[^ ]+]] = phi i32 [ %add
+; CHECK: [[ARG3_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg3, i32 [[IV]]
+; CHECK: [[ARG3:%[^ ]+]] = bitcast i16* [[ARG3_GEP]] to i32*
+; CHECK: load i32, i32* [[ARG3]]
+; CHECK: [[ARG2_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg2, i32 [[IV]]
+; CHECK: [[ARG2:%[^ ]+]] = bitcast i16* [[ARG2_GEP]] to i32*
+; CHECK: load i32, i32* [[ARG2]]
+; CHECK: store
+define dso_local i32 @store_alias_arg3_legal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ store i16 42, i16* %arrayidx, align 2
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_legal_2
+; CHECK-NOT: store
+; CHECK: [[BITCAST:[^ ]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: load i32, i32* [[BITCAST]]
+; CHECK: store i16 42, i16* %arrayidx
+; CHECK: [[BITCAST3:[^ ]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: load i32, i32* [[BITCAST3]]
+; CHECK: smlad
+define dso_local i32 @store_alias_arg3_legal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ store i16 42, i16* %arrayidx, align 2
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_illegal_1
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg3_illegal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ store i16 42, i16* %arrayidx1, align 2
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_illegal_2
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg3_illegal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ store i16 42, i16* %arrayidx, align 2
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg2_illegal_1
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg2_illegal_1(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ store i16 42, i16* %arrayidx6, align 2
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg2_illegal_2
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg2_illegal_2(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2
+ %.pre27 = load i16, i16* %arg2, align 2
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+ ret i32 %mac1.0.lcssa
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2
+ %add = add nuw nsw i32 %i.025, 1
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ store i16 42, i16* %arrayidx3, align 2
+ %3 = load i16, i16* %arrayidx6, align 2
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10
+ %exitcond = icmp ne i32 %add, %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: one_pair_alias
+; FIXME: This test shows that we have a bug with smlad insertion
+define i32 @one_pair_alias(i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 %add26
+
+for.body: ; preds = %for.body, %entry
+ %i.050 = phi i32 [ 0, %entry ], [ %add27, %for.body ]
+ %a.049 = phi i32 [ 0, %entry ], [ %add26, %for.body ]
+ %add3 = or i32 %i.050, 1
+ %add11 = or i32 %i.050, 2
+ %add19 = or i32 %i.050, 3
+ %arrayidx = getelementptr inbounds i16, i16* %b, i32 %i.050
+ %arrayidx4 = getelementptr inbounds i16, i16* %b, i32 %add3
+ %arrayidx12 = getelementptr inbounds i16, i16* %b, i32 %add11
+ %arrayidx20 = getelementptr inbounds i16, i16* %b, i32 %add19
+ %arrayidx1 = getelementptr inbounds i16, i16* %c, i32 %i.050
+ %arrayidx7 = getelementptr inbounds i16, i16* %c, i32 %add3
+ %arrayidx15 = getelementptr inbounds i16, i16* %c, i32 %add11
+ %arrayidx23 = getelementptr inbounds i16, i16* %c, i32 %add19
+ %tmp = load i16, i16* %arrayidx, align 2
+ %tmp2 = load i16, i16* %arrayidx4, align 2
+ %tmp4 = load i16, i16* %arrayidx12, align 2
+ %tmp6 = load i16, i16* %arrayidx20, align 2
+ %tmp1 = load i16, i16* %arrayidx1, align 2
+  store i16 43, i16* %arrayidx7
+ %tmp3 = load i16, i16* %arrayidx7, align 2
+ %tmp5 = load i16, i16* %arrayidx15, align 2
+ %tmp7 = load i16, i16* %arrayidx23, align 2
+ %conv = sext i16 %tmp to i32
+ %conv2 = sext i16 %tmp1 to i32
+ %mul = mul nsw i32 %conv2, %conv
+ %add = add nsw i32 %mul, %a.049
+ %conv5 = sext i16 %tmp2 to i32
+ %conv8 = sext i16 %tmp3 to i32
+ %mul9 = mul nsw i32 %conv8, %conv5
+ %add10 = add nsw i32 %add, %mul9
+ %conv13 = sext i16 %tmp4 to i32
+ %conv16 = sext i16 %tmp5 to i32
+ %mul17 = mul nsw i32 %conv16, %conv13
+ %add18 = add nsw i32 %add10, %mul17
+ %conv21 = sext i16 %tmp6 to i32
+ %conv24 = sext i16 %tmp7 to i32
+ %mul25 = mul nsw i32 %conv24, %conv21
+ %add26 = add nsw i32 %add18, %mul25
+ %add27 = add nuw nsw i32 %i.050, 4
+ %cmp = icmp ult i32 %add27, 100
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
;
; CHECK-LABEL: @OneReduction
; CHECK: %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
-; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
-; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026)
+; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V7]], i32 [[V5]], i32 %mac1{{\.}}026)
; CHECK-NOT: call i32 @llvm.arm.smlad
;
; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
; CHECK-LABEL: @test1
; CHECK: %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
-; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
-; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026)
+; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V7]], i32 [[V5]], i32 %mac1{{\.}}026)
define dso_local i32 @test1(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
; A more complicated chain: 4 mul operations, so we expect 2 smlad calls.
;
; CHECK: %mac1{{\.}}054 = phi i32 [ [[V17:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
-; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
; CHECK: [[V10:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V11:%[0-9]+]] = load i32, i32* [[V10]], align 2
-; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
-; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
-; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
; CHECK: [[V15:%[0-9]+]] = bitcast i16* %arrayidx4 to i32*
; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2
+; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
+; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
+; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
+; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
+; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[V12]])
;
; And we don't want to see a 3rd smlad:
+++ /dev/null
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-;
-; Alias check: check that the rewrite isn't triggered when there's a store
-; instruction possibly aliasing any mul load operands; arguments are passed
-; without 'restrict' enabled.
-;
-; CHECK-NOT: call i32 @llvm.arm.smlad
-;
-define dso_local i32 @test(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
-entry:
- %cmp24 = icmp sgt i32 %arg, 0
- br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %.pre = load i16, i16* %arg3, align 2
- %.pre27 = load i16, i16* %arg2, align 2
- br label %for.body
-
-for.cond.cleanup:
- %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
- ret i32 %mac1.0.lcssa
-
-for.body:
- %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
- %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
- %0 = load i16, i16* %arrayidx, align 2
-
-; Store inserted here, aliasing with arrayidx, arrayidx1, arrayidx3
- store i16 42, i16* %arrayidx, align 2
-
- %add = add nuw nsw i32 %i.025, 1
- %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
- %1 = load i16, i16* %arrayidx1, align 2
- %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
- %2 = load i16, i16* %arrayidx3, align 2
- %conv = sext i16 %2 to i32
- %conv4 = sext i16 %0 to i32
- %mul = mul nsw i32 %conv, %conv4
- %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
- %3 = load i16, i16* %arrayidx6, align 2
- %conv7 = sext i16 %3 to i32
- %conv8 = sext i16 %1 to i32
- %mul9 = mul nsw i32 %conv7, %conv8
- %add10 = add i32 %mul, %mac1.026
- %add11 = add i32 %mul9, %add10
- %exitcond = icmp ne i32 %add, %arg
- br i1 %exitcond, label %for.body, label %for.cond.cleanup
-}
-
+++ /dev/null
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-;
-; Alias check: check that the rewrite isn't triggered when there's a store
-; aliasing one of the mul load operands. Arguments are now annotated with
-; 'noalias'.
-;
-; CHECK-NOT: call i32 @llvm.arm.smlad
-;
-define dso_local i32 @test(i32 %arg, i32* noalias %arg1, i16* noalias readonly %arg2, i16* noalias readonly %arg3) {
-entry:
- %cmp24 = icmp sgt i32 %arg, 0
- br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %.pre = load i16, i16* %arg3, align 2
- %.pre27 = load i16, i16* %arg2, align 2
- br label %for.body
-
-for.cond.cleanup:
- %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
- ret i32 %mac1.0.lcssa
-
-for.body:
- %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
- %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
- %0 = load i16, i16* %arrayidx, align 2
-
-; Store inserted here, aliasing only with loads from 'arrayidx'.
- store i16 42, i16* %arrayidx, align 2
-
- %add = add nuw nsw i32 %i.025, 1
- %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
- %1 = load i16, i16* %arrayidx1, align 2
- %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
- %2 = load i16, i16* %arrayidx3, align 2
- %conv = sext i16 %2 to i32
- %conv4 = sext i16 %0 to i32
- %mul = mul nsw i32 %conv, %conv4
- %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
- %3 = load i16, i16* %arrayidx6, align 2
- %conv7 = sext i16 %3 to i32
- %conv8 = sext i16 %1 to i32
- %mul9 = mul nsw i32 %conv7, %conv8
- %add10 = add i32 %mul, %mac1.026
-
-; Here the Mul is the LHS, and the Add the RHS.
- %add11 = add i32 %mul9, %add10
-
- %exitcond = icmp ne i32 %add, %arg
- br i1 %exitcond, label %for.body, label %for.cond.cleanup
-}
-
; CHECK-LABEL: smladx
; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
; CHECK: [[ACC0:%[^ ]+]] = phi i32 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]])
; CHECK-NOT: call i32 @llvm.arm.smlad
; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
; CHECK: [[IV:%[^ ]+]] = phi i32
; CHECK: [[ACC0:%[^ ]+]] = phi i32 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
-; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
+; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
+; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
+; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
+; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
-; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
-; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
-; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
-; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]])
; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
;
; CHECK-LABEL: @OneReduction
; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)
; CHECK-NOT: call i64 @llvm.arm.smlald
;
; CHECK-LABEL: @test1
; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)
define dso_local i64 @test1(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
;
; CHECK-LABEL: @OneReduction
; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)
; CHECK-NOT: call i64 @llvm.arm.smlald
;
; CHECK-LABEL: smlaldx
; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
; CHECK-NOT: call i64 @llvm.arm.smlad
; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
; CHECK: [[IV:%[^ ]+]] = phi i32
; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
+; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+
; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
+; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
+; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
-; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
-; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
-; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
-; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])
; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
; CHECK-LABEL: smlaldx
; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
; CHECK-NOT: call i64 @llvm.arm.smlad
; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
; CHECK: [[PIN1:%[^ ]+]] = phi i16* [ [[PIN1_NEXT:%[^ ]+]], %for.body ], [ [[PIN1Base]], %for.body.preheader.new ]
; CHECK: [[IV:%[^ ]+]] = phi i32
; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
-; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+
+; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
-; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
-; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
+; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
+; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
+
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN1]], i32 [[IN2_2]], i64 [[ACC1]])
; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4