-//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
+//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
}
LoadInst *getBaseLoad() const {
- return VecLd.front();
+ return cast<LoadInst>(LHS);
}
};
}
};
- class ARMParallelDSP : public FunctionPass {
+ class ARMParallelDSP : public LoopPass {
ScalarEvolution *SE;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
DominatorTree *DT;
+ LoopInfo *LI;
+ Loop *L;
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
/// products to a 32-bit accumulate operand. Optionally, the instruction can
/// exchange the halfwords of the second operand before performing the
/// arithmetic.
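/// For illustration only, an informal sketch of the dual-MAC semantics (not
/// quoted from the ARM ARM; the halfwords are treated as signed 16-bit values):
///   smlad  rd, rn, rm, ra : rd = ra + rn[15:0]*rm[15:0] + rn[31:16]*rm[31:16]
///   smladx rd, rn, rm, ra : rd = ra + rn[15:0]*rm[31:16] + rn[31:16]*rm[15:0]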
- bool MatchSMLAD(Function &F);
+ bool MatchSMLAD(Loop *L);
public:
static char ID;
- ARMParallelDSP() : FunctionPass(ID) { }
+ ARMParallelDSP() : LoopPass(ID) { }
+
+ bool doInitialization(Loop *L, LPPassManager &LPM) override {
+ LoadPairs.clear();
+ WideLoads.clear();
+ return true;
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- FunctionPass::getAnalysisUsage(AU);
+ LoopPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.setPreservesCFG();
}
- bool runOnFunction(Function &F) override {
+ bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
if (DisableParallelDSP)
return false;
- if (skipFunction(F))
+ if (skipLoop(TheLoop))
return false;
+ L = TheLoop;
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &TPC = getAnalysis<TargetPassConfig>();
+ BasicBlock *Header = TheLoop->getHeader();
+ if (!Header)
+ return false;
+
+ // TODO: We assume the loop header and latch to be the same block.
+ // This is not a fundamental restriction, but lifting this would just
+ // require more work to do the transformation and then patch up the CFG.
+ if (Header != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
+ "running pass ARMParallelDSP\n");
+ return false;
+ }
+
+ if (!TheLoop->getLoopPreheader())
+ InsertPreheaderForLoop(L, DT, LI, nullptr, true);
+
+ Function &F = *Header->getParent();
M = F.getParent();
DL = &M->getDataLayout();
return false;
}
+ LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
+
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- bool Changes = MatchSMLAD(F);
+ if (!RecordMemoryOps(Header)) {
+ LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
+ return false;
+ }
+
+ bool Changes = MatchSMLAD(L);
return Changes;
}
};
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
SmallVector<Instruction*, 8> Writes;
- LoadPairs.clear();
- WideLoads.clear();
// Collect loads and instructions that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
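// For illustration, a hypothetical pair of loads that would be recorded as
// sequential, each simple and feeding a single sext user:
//   %ld.0 = load i16, i16* %addr.0        ; base
//   %ld.1 = load i16, i16* %addr.1        ; base + 2 bytes
//   %sxt.0 = sext i16 %ld.0 to i32
//   %sxt.1 = sext i16 %ld.1 to i32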
return LoadPairs.size() > 1;
}
-// The pass needs to identify integer add/sub reductions of 16-bit vector
+// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find integer add then look for this pattern:
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
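// As an informal sketch (value names hypothetical), the reduction searched
// for in the loop latch looks like:
//   %sxt.a0 = sext i16 %ld.a0 to i32      ; %ld.a0/%ld.a1 are sequential loads
//   %sxt.a1 = sext i16 %ld.a1 to i32
//   %sxt.b0 = sext i16 %ld.b0 to i32      ; %ld.b0/%ld.b1 are sequential loads
//   %sxt.b1 = sext i16 %ld.b1 to i32
//   %mul.0 = mul i32 %sxt.a0, %sxt.b0
//   %mul.1 = mul i32 %sxt.a1, %sxt.b1
//   %add = add i32 %mul.0, %mul.1
//   %acc.next = add i32 %add, %acc        ; %acc is the accumulator
//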
-bool ARMParallelDSP::MatchSMLAD(Function &F) {
+bool ARMParallelDSP::MatchSMLAD(Loop *L) {
// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumulator.
- std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
- (Value *V, BasicBlock *BB, Reduction &R) -> bool {
+ std::function<bool(Value*, Reduction&)> Search = [&]
+ (Value *V, Reduction &R) -> bool {
// If we find a non-instruction, try to use it as the initial accumulator
// value. This may have already been found during the search in which case
if (!I)
return R.InsertAcc(V);
- if (I->getParent() != BB)
- return false;
-
switch (I->getOpcode()) {
default:
break;
// Adds should be adding together two muls, or another add and a mul to
// be within the mac chain. One of the operands may also be the
// accumulator value at which point we should stop searching.
- bool ValidLHS = Search(I->getOperand(0), BB, R);
- bool ValidRHS = Search(I->getOperand(1), BB, R);
+ bool ValidLHS = Search(I->getOperand(0), R);
+ bool ValidRHS = Search(I->getOperand(1), R);
if (!ValidLHS && !ValidRHS)
return false;
else if (ValidLHS && ValidRHS) {
return false;
}
case Instruction::SExt:
- return Search(I->getOperand(0), BB, R);
+ return Search(I->getOperand(0), R);
}
return false;
};
bool Changed = false;
+ SmallPtrSet<Instruction*, 4> AllAdds;
+ BasicBlock *Latch = L->getLoopLatch();
- for (auto &BB : F) {
- SmallPtrSet<Instruction*, 4> AllAdds;
- if (!RecordMemoryOps(&BB))
+ for (Instruction &I : reverse(*Latch)) {
+ if (I.getOpcode() != Instruction::Add)
continue;
- for (Instruction &I : reverse(BB)) {
- if (I.getOpcode() != Instruction::Add)
- continue;
-
- if (AllAdds.count(&I))
- continue;
+ if (AllAdds.count(&I))
+ continue;
- const auto *Ty = I.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
+ const auto *Ty = I.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
- Reduction R(&I);
- if (!Search(&I, &BB, R))
- continue;
+ Reduction R(&I);
+ if (!Search(&I, R))
+ continue;
- if (!CreateParallelPairs(R))
- continue;
+ if (!CreateParallelPairs(R))
+ continue;
- InsertParallelMACs(R);
- Changed = true;
- AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
- }
+ InsertParallelMACs(R);
+ Changed = true;
+ AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
}
return Changed;
// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
// TODO: Support big-endian as well.
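// Assuming little-endian (a sketch of what the code below produces), with
// Loads[0] as the Base load and Loads[1] as the Offset load, the widened
// i32 load is split as:
//   Bottom = trunc(WideLoad)              ; feeds BaseSExt
//   Top    = trunc(lshr(WideLoad, 16))    ; feeds OffsetSExt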
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
- Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
- BaseSExt->replaceAllUsesWith(NewBaseSExt);
+ BaseSExt->setOperand(0, Bottom);
IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
- Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
- OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+ OffsetSExt->setOperand(0, Trunc);
WideLoads.emplace(std::make_pair(Base,
std::make_unique<WidenedLoad>(Loads, WideLoad)));
char ARMParallelDSP::ID = 0;
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
- "Transform functions to use DSP intrinsics", false, false)
+ "Transform loops to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
- "Transform functions to use DSP intrinsics", false, false)
+ "Transform loops to use DSP intrinsics", false, false)
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
-; CHECK-NEXT: Transform functions to use DSP intrinsics
+; CHECK-NEXT: Loop Pass Manager
+; CHECK-NEXT: Transform loops to use DSP intrinsics
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: ARM IR optimizations
; CHECK-NEXT: Dominator Tree Construction
+++ /dev/null
-; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
-
-; CHECK-LABEL: single_block
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
-define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
-entry:
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.1 = load i16, i16* %addr.a.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %add = add i32 %mul.0, %mul.1
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: multi_block
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
-define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
-entry:
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.1 = load i16, i16* %addr.a.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %add = add i32 %mul.0, %mul.1
- br label %bb.1
-
-bb.1:
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: multi_block_1
-; CHECK-NOT: call i32 @llvm.arm.smlad
-define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
-entry:
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- br label %bb.1
-
-bb.1:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.1 = load i16, i16* %addr.a.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %add = add i32 %mul.0, %mul.1
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
+++ /dev/null
-; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
-
-; CHECK-LABEL: exchange_1
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
-define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.1
- %mul.1 = mul i32 %sext.a.1, %sext.b.0
- %add = add i32 %mul.0, %mul.1
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_2
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
-define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.b.1, %sext.a.0
- %mul.1 = mul i32 %sext.b.0, %sext.a.1
- %add = add i32 %mul.0, %mul.1
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_3
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
-define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.1
- %mul.1 = mul i32 %sext.a.1, %sext.b.0
- %add = add i32 %mul.1, %mul.0
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_4
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
-define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.b.1, %sext.a.0
- %mul.1 = mul i32 %sext.b.0, %sext.a.1
- %add = add i32 %mul.1, %mul.0
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_multi_use_1
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
-; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
-; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
-define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.1
- %mul.1 = mul i32 %sext.a.1, %sext.b.0
- %add = add i32 %mul.0, %mul.1
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.a.3 = getelementptr i16, i16* %a, i32 3
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.a.3 = load i16, i16* %addr.a.3
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.a.3 = sext i16 %ld.a.3 to i32
- %mul.2 = mul i32 %sext.a.3, %sext.b.1
- %mul.3 = mul i32 %sext.a.2, %sext.b.0
- %add.1 = add i32 %mul.2, %mul.3
- %add.2 = add i32 %add, %add.1
- %res = add i32 %add.2, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_multi_use_2
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
-; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
-; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
-define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %add = add i32 %mul.0, %mul.1
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.a.3 = getelementptr i16, i16* %a, i32 3
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.a.3 = load i16, i16* %addr.a.3
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.a.3 = sext i16 %ld.a.3 to i32
- %mul.2 = mul i32 %sext.b.0, %sext.a.3
- %mul.3 = mul i32 %sext.b.1, %sext.a.2
- %add.1 = add i32 %mul.2, %mul.3
- %add.2 = add i32 %add, %add.1
- %res = add i32 %add.2, %acc
- ret i32 %res
-}
-
-; TODO: Why aren't two intrinsics generated?
-; CHECK-LABEL: exchange_multi_use_3
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
-; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
-; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK-NOT: call i32 @llvm.arm.smlad
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0
-define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.a.3 = getelementptr i16, i16* %a, i32 3
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.a.3 = load i16, i16* %addr.a.3
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.a.3 = sext i16 %ld.a.3 to i32
- %mul.2 = mul i32 %sext.b.0, %sext.a.3
- %mul.3 = mul i32 %sext.b.1, %sext.a.2
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %add = add i32 %mul.0, %mul.1
- %add.1 = add i32 %mul.2, %mul.3
- %sub = sub i32 %add, %add.1
- %res = add i32 %acc, %sub
- ret i32 %res
-}
-
-; TODO: Why isn't smladx generated too?
-; CHECK-LABEL: exchange_multi_use_4
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0
-; CHECK-NOT: call i32 @llvm.arm.smlad
-define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.a.3 = getelementptr i16, i16* %a, i32 3
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.a.3 = load i16, i16* %addr.a.3
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.a.3 = sext i16 %ld.a.3 to i32
- %mul.2 = mul i32 %sext.b.0, %sext.a.3
- %mul.3 = mul i32 %sext.b.1, %sext.a.2
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %add.1 = add i32 %mul.2, %mul.3
- %add = add i32 %mul.0, %mul.1
- %sub = sub i32 %add, %add.1
- %res = add i32 %acc, %sub
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_swap
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
-define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.1, %sext.b.0
- %mul.1 = mul i32 %sext.a.0, %sext.b.1
- %add = add i32 %mul.0, %mul.1
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_swap_2
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
-define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.1, %sext.b.0
- %mul.1 = mul i32 %sext.a.0, %sext.b.1
- %add = add i32 %mul.1, %mul.0
- %res = add i32 %add, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: exchange_swap_3
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
-define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.b.0, %sext.a.1
- %mul.1 = mul i32 %sext.b.1, %sext.a.0
- %add = add i32 %mul.1, %mul.0
- %res = add i32 %add, %acc
- ret i32 %res
-}
+++ /dev/null
-; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
-
-; CHECK-LABEL: overlap_1
-; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
-; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
-define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.b.2 = getelementptr i16, i16* %b, i32 2
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.b.2 = load i16, i16* %addr.b.2
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.b.2 = sext i16 %ld.b.2 to i32
- %mul.2 = mul i32 %sext.a.2, %sext.b.2
- %add = add i32 %mul.0, %mul.1
- %add.1 = add i32 %mul.1, %mul.2
- %add.2 = add i32 %add.1, %add
- %res = add i32 %add.2, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: overlap_2
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
-define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.b.2 = getelementptr i16, i16* %b, i32 2
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.b.2 = load i16, i16* %addr.b.2
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.b.2 = sext i16 %ld.b.2 to i32
- %mul.2 = mul i32 %sext.b.2, %sext.a.2
- %add = add i32 %mul.0, %mul.1
- %add.1 = add i32 %mul.1, %mul.2
- %add.2 = add i32 %add, %add.1
- %res = add i32 %add.2, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: overlap_3
-; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
-; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
-; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
-; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
-; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
-define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.b.2 = getelementptr i16, i16* %b, i32 2
- %addr.a.3 = getelementptr i16, i16* %a, i32 3
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.b.2 = load i16, i16* %addr.b.2
- %ld.a.3 = load i16, i16* %addr.a.3
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.b.2 = sext i16 %ld.b.2 to i32
- %sext.a.3 = sext i16 %ld.a.3 to i32
- %mul.2 = mul i32 %sext.a.2, %sext.b.1
- %mul.3 = mul i32 %sext.a.3, %sext.b.2
- %add = add i32 %mul.0, %mul.1
- %add.1 = add i32 %mul.2, %mul.3
- %add.2 = add i32 %add.1, %add
- %res = add i32 %add.2, %acc
- ret i32 %res
-}
-
-; CHECK-LABEL: overlap_4
-; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
-; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
-; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
-; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
-; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
-; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
-; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
-; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
-define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {
-entry:
- %addr.a.1 = getelementptr i16, i16* %a, i32 1
- %addr.b.1 = getelementptr i16, i16* %b, i32 1
- %ld.a.0 = load i16, i16* %a
- %sext.a.0 = sext i16 %ld.a.0 to i32
- %ld.b.0 = load i16, i16* %b
- %ld.a.1 = load i16, i16* %addr.a.1
- %ld.b.1 = load i16, i16* %addr.b.1
- %sext.a.1 = sext i16 %ld.a.1 to i32
- %sext.b.1 = sext i16 %ld.b.1 to i32
- %sext.b.0 = sext i16 %ld.b.0 to i32
- %mul.0 = mul i32 %sext.a.0, %sext.b.0
- %mul.1 = mul i32 %sext.a.1, %sext.b.1
- %addr.a.2 = getelementptr i16, i16* %a, i32 2
- %addr.b.2 = getelementptr i16, i16* %b, i32 2
- %addr.a.3 = getelementptr i16, i16* %a, i32 3
- %ld.a.2 = load i16, i16* %addr.a.2
- %ld.b.2 = load i16, i16* %addr.b.2
- %ld.a.3 = load i16, i16* %addr.a.3
- %sext.a.2 = sext i16 %ld.a.2 to i32
- %sext.b.2 = sext i16 %ld.b.2 to i32
- %sext.a.3 = sext i16 %ld.a.3 to i32
- %mul.2 = mul i32 %sext.b.2, %sext.a.2
- %mul.3 = mul i32 %sext.b.1, %sext.a.3
- %add = add i32 %mul.0, %mul.1
- %add.1 = add i32 %mul.2, %mul.3
- %add.2 = add i32 %add.1, %add
- %res = add i32 %add.2, %acc
- ret i32 %res
-}
;
; The loop header is not the loop latch.
;
-; CHECK: call i32 @llvm.arm.smlad
+; CHECK-NOT: call i32 @llvm.arm.smlad
;
define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry: