From: Nico Weber Date: Wed, 21 Aug 2019 19:53:42 +0000 (+0000) Subject: Revert r367389 (and follow-up r368404); it caused PR43073. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=59699b910dbb15a923283efb28a688a99e3c3699;p=llvm Revert r367389 (and follow-up r368404); it caused PR43073. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369567 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp index 1f15ddb85d4..5717a7102b6 100644 --- a/lib/Target/ARM/ARMParallelDSP.cpp +++ b/lib/Target/ARM/ARMParallelDSP.cpp @@ -1,4 +1,4 @@ -//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// +//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -18,10 +18,13 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -68,7 +71,7 @@ namespace { } LoadInst *getBaseLoad() const { - return VecLd.front(); + return cast(LHS); } }; @@ -155,11 +158,13 @@ namespace { } }; - class ARMParallelDSP : public FunctionPass { + class ARMParallelDSP : public LoopPass { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; DominatorTree *DT; + LoopInfo *LI; + Loop *L; const DataLayout *DL; Module *M; std::map LoadPairs; @@ -180,38 +185,63 @@ namespace { /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. - bool MatchSMLAD(Function &F); + bool MatchSMLAD(Loop *L); public: static char ID; - ARMParallelDSP() : FunctionPass(ID) { } + ARMParallelDSP() : LoopPass(ID) { } + + bool doInitialization(Loop *L, LPPassManager &LPM) override { + LoadPairs.clear(); + WideLoads.clear(); + return true; + } void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); + LoopPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.setPreservesCFG(); } - bool runOnFunction(Function &F) override { + bool runOnLoop(Loop *TheLoop, LPPassManager &) override { if (DisableParallelDSP) return false; - if (skipFunction(F)) + if (skipLoop(TheLoop)) return false; + L = TheLoop; SE = &getAnalysis().getSE(); AA = &getAnalysis().getAAResults(); TLI = &getAnalysis().getTLI(); DT = &getAnalysis().getDomTree(); + LI = &getAnalysis().getLoopInfo(); auto &TPC = getAnalysis(); + BasicBlock *Header = TheLoop->getHeader(); + if (!Header) + return false; + + // TODO: We assume the loop header and latch to be the same block. + // This is not a fundamental restriction, but lifting this would just + // require more work to do the transformation and then patch up the CFG. + if (Header != TheLoop->getLoopLatch()) { + LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not " + "running pass ARMParallelDSP\n"); + return false; + } + + if (!TheLoop->getLoopPreheader()) + InsertPreheaderForLoop(L, DT, LI, nullptr, true); + + Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -236,10 +266,17 @@ namespace { return false; } + LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); + LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - bool Changes = MatchSMLAD(F); + if (!RecordMemoryOps(Header)) { + LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); + return false; + } + + bool Changes = MatchSMLAD(L); return Changes; } }; @@ -300,8 +337,6 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) { bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector Loads; SmallVector Writes; - LoadPairs.clear(); - WideLoads.clear(); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. @@ -379,7 +414,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { return LoadPairs.size() > 1; } -// The pass needs to identify integer add/sub reductions of 16-bit vector +// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector // multiplications. // To use SMLAD: // 1) we first need to find integer add then look for this pattern: @@ -410,13 +445,13 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // -bool ARMParallelDSP::MatchSMLAD(Function &F) { +bool ARMParallelDSP::MatchSMLAD(Loop *L) { // Search recursively back through the operands to find a tree of values that // form a multiply-accumulate chain. The search records the Add and Mul // instructions that form the reduction and allows us to find a single value // to be used as the initial input to the accumlator. - std::function Search = [&] - (Value *V, BasicBlock *BB, Reduction &R) -> bool { + std::function Search = [&] + (Value *V, Reduction &R) -> bool { // If we find a non-instruction, try to use it as the initial accumulator // value. This may have already been found during the search in which case @@ -425,9 +460,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { if (!I) return R.InsertAcc(V); - if (I->getParent() != BB) - return false; - switch (I->getOpcode()) { default: break; @@ -438,8 +470,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { // Adds should be adding together two muls, or another add and a mul to // be within the mac chain. One of the operands may also be the // accumulator value at which point we should stop searching. - bool ValidLHS = Search(I->getOperand(0), BB, R); - bool ValidRHS = Search(I->getOperand(1), BB, R); + bool ValidLHS = Search(I->getOperand(0), R); + bool ValidRHS = Search(I->getOperand(1), R); if (!ValidLHS && !ValidLHS) return false; else if (ValidLHS && ValidRHS) { @@ -465,40 +497,36 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { return false; } case Instruction::SExt: - return Search(I->getOperand(0), BB, R); + return Search(I->getOperand(0), R); } return false; }; bool Changed = false; + SmallPtrSet AllAdds; + BasicBlock *Latch = L->getLoopLatch(); - for (auto &BB : F) { - SmallPtrSet AllAdds; - if (!RecordMemoryOps(&BB)) + for (Instruction &I : reverse(*Latch)) { + if (I.getOpcode() != Instruction::Add) continue; - for (Instruction &I : reverse(BB)) { - if (I.getOpcode() != Instruction::Add) - continue; - - if (AllAdds.count(&I)) - continue; + if (AllAdds.count(&I)) + continue; - const auto *Ty = I.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; - Reduction R(&I); - if (!Search(&I, &BB, R)) - continue; + Reduction R(&I); + if (!Search(&I, R)) + continue; - if (!CreateParallelPairs(R)) - continue; + if (!CreateParallelPairs(R)) + continue; - InsertParallelMACs(R); - Changed = true; - AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); - } + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); } return Changed; @@ -696,15 +724,13 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); - Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); - BaseSExt->replaceAllUsesWith(NewBaseSExt); + BaseSExt->setOperand(0, Bottom); IntegerType *OffsetTy = cast(Offset->getType()); Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); - Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); - OffsetSExt->replaceAllUsesWith(NewOffsetSExt); + OffsetSExt->setOperand(0, Trunc); WideLoads.emplace(std::make_pair(Base, std::make_unique(Loads, WideLoad))); @@ -718,6 +744,6 @@ Pass *llvm::createARMParallelDSPPass() { char ARMParallelDSP::ID = 0; INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", - "Transform functions to use DSP intrinsics", false, false) + "Transform loops to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp", - "Transform functions to use DSP intrinsics", false, false) + "Transform loops to use DSP intrinsics", false, false) diff --git a/test/CodeGen/ARM/O3-pipeline.ll b/test/CodeGen/ARM/O3-pipeline.ll index c528f5d0cee..ec96f055a05 100644 --- a/test/CodeGen/ARM/O3-pipeline.ll +++ b/test/CodeGen/ARM/O3-pipeline.ll @@ -37,7 +37,8 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Transform loops to use DSP intrinsics ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: ARM IR optimizations ; CHECK-NEXT: Dominator Tree Construction diff --git a/test/CodeGen/ARM/ParallelDSP/blocks.ll b/test/CodeGen/ARM/ParallelDSP/blocks.ll deleted file mode 100644 index d9dbd960974..00000000000 --- a/test/CodeGen/ARM/ParallelDSP/blocks.ll +++ /dev/null @@ -1,79 +0,0 @@ -; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s - -; CHECK-LABEL: single_block -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc) -define i32 @single_block(i16* %a, i16* %b, i32 %acc) { -entry: - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.1 = load i16, i16* %addr.a.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %add = add i32 %mul.0, %mul.1 - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: multi_block -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0) -define i32 @multi_block(i16* %a, i16* %b, i32 %acc) { -entry: - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.1 = load i16, i16* %addr.a.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %add = add i32 %mul.0, %mul.1 - br label %bb.1 - -bb.1: - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: multi_block_1 -; CHECK-NOT: call i32 @llvm.arm.smlad -define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) { -entry: - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - br label %bb.1 - -bb.1: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.1 = load i16, i16* %addr.a.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %add = add i32 %mul.0, %mul.1 - %res = add i32 %add, %acc - ret i32 %res -} - diff --git a/test/CodeGen/ARM/ParallelDSP/exchange.ll b/test/CodeGen/ARM/ParallelDSP/exchange.ll deleted file mode 100644 index c072df49cdf..00000000000 --- a/test/CodeGen/ARM/ParallelDSP/exchange.ll +++ /dev/null @@ -1,329 +0,0 @@ -; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s - -; CHECK-LABEL: exchange_1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] -define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.1 - %mul.1 = mul i32 %sext.a.1, %sext.b.0 - %add = add i32 %mul.0, %mul.1 - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: exchange_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] -define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.b.1, %sext.a.0 - %mul.1 = mul i32 %sext.b.0, %sext.a.1 - %add = add i32 %mul.0, %mul.1 - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: exchange_3 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] -define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.1 - %mul.1 = mul i32 %sext.a.1, %sext.b.0 - %add = add i32 %mul.1, %mul.0 - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: exchange_4 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] -define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.b.1, %sext.a.0 - %mul.1 = mul i32 %sext.b.0, %sext.a.1 - %add = add i32 %mul.1, %mul.0 - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: exchange_multi_use_1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) -define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.1 - %mul.1 = mul i32 %sext.a.1, %sext.b.0 - %add = add i32 %mul.0, %mul.1 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.a.3 = getelementptr i16, i16* %a, i32 3 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.a.3 = load i16, i16* %addr.a.3 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.a.3 = sext i16 %ld.a.3 to i32 - %mul.2 = mul i32 %sext.a.3, %sext.b.1 - %mul.3 = mul i32 %sext.a.2, %sext.b.0 - %add.1 = add i32 %mul.2, %mul.3 - %add.2 = add i32 %add, %add.1 - %res = add i32 %add.2, %acc - ret i32 %res -} - -; CHECK-LABEL: exchange_multi_use_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) -define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %add = add i32 %mul.0, %mul.1 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.a.3 = getelementptr i16, i16* %a, i32 3 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.a.3 = load i16, i16* %addr.a.3 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.a.3 = sext i16 %ld.a.3 to i32 - %mul.2 = mul i32 %sext.b.0, %sext.a.3 - %mul.3 = mul i32 %sext.b.1, %sext.a.2 - %add.1 = add i32 %mul.2, %mul.3 - %add.2 = add i32 %add, %add.1 - %res = add i32 %add.2, %acc - ret i32 %res -} - -; TODO: Why aren't two intrinsics generated? -; CHECK-LABEL: exchange_multi_use_3 -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK-NOT: call i32 @llvm.arm.smlad -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0 -define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.a.3 = getelementptr i16, i16* %a, i32 3 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.a.3 = load i16, i16* %addr.a.3 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.a.3 = sext i16 %ld.a.3 to i32 - %mul.2 = mul i32 %sext.b.0, %sext.a.3 - %mul.3 = mul i32 %sext.b.1, %sext.a.2 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %add = add i32 %mul.0, %mul.1 - %add.1 = add i32 %mul.2, %mul.3 - %sub = sub i32 %add, %add.1 - %res = add i32 %acc, %sub - ret i32 %res -} - -; TODO: Why isn't smladx generated too? -; CHECK-LABEL: exchange_multi_use_4 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0 -; CHECK-NOT: call i32 @llvm.arm.smlad -define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.a.3 = getelementptr i16, i16* %a, i32 3 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.a.3 = load i16, i16* %addr.a.3 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.a.3 = sext i16 %ld.a.3 to i32 - %mul.2 = mul i32 %sext.b.0, %sext.a.3 - %mul.3 = mul i32 %sext.b.1, %sext.a.2 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %add.1 = add i32 %mul.2, %mul.3 - %add = add i32 %mul.0, %mul.1 - %sub = sub i32 %add, %add.1 - %res = add i32 %acc, %sub - ret i32 %res -} - -; CHECK-LABEL: exchange_swap -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] -define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.1, %sext.b.0 - %mul.1 = mul i32 %sext.a.0, %sext.b.1 - %add = add i32 %mul.0, %mul.1 - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: exchange_swap_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] -define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.1, %sext.b.0 - %mul.1 = mul i32 %sext.a.0, %sext.b.1 - %add = add i32 %mul.1, %mul.0 - %res = add i32 %add, %acc - ret i32 %res -} - -; CHECK-LABEL: exchange_swap_3 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] -define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.b.0, %sext.a.1 - %mul.1 = mul i32 %sext.b.1, %sext.a.0 - %add = add i32 %mul.1, %mul.0 - %res = add i32 %add, %acc - ret i32 %res -} diff --git a/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/test/CodeGen/ARM/ParallelDSP/overlapping.ll deleted file mode 100644 index 238f1eb0301..00000000000 --- a/test/CodeGen/ARM/ParallelDSP/overlapping.ll +++ /dev/null @@ -1,161 +0,0 @@ -; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s - -; CHECK-LABEL: overlap_1 -; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 -; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc -define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.b.2 = getelementptr i16, i16* %b, i32 2 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.b.2 = load i16, i16* %addr.b.2 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.b.2 = sext i16 %ld.b.2 to i32 - %mul.2 = mul i32 %sext.a.2, %sext.b.2 - %add = add i32 %mul.0, %mul.1 - %add.1 = add i32 %mul.1, %mul.2 - %add.2 = add i32 %add.1, %add - %res = add i32 %add.2, %acc - ret i32 %res -} - -; CHECK-LABEL: overlap_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc -define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.b.2 = getelementptr i16, i16* %b, i32 2 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.b.2 = load i16, i16* %addr.b.2 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.b.2 = sext i16 %ld.b.2 to i32 - %mul.2 = mul i32 %sext.b.2, %sext.a.2 - %add = add i32 %mul.0, %mul.1 - %add.1 = add i32 %mul.1, %mul.2 - %add.2 = add i32 %add, %add.1 - %res = add i32 %add.2, %acc - ret i32 %res -} - -; CHECK-LABEL: overlap_3 -; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* -; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) -define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.b.2 = getelementptr i16, i16* %b, i32 2 - %addr.a.3 = getelementptr i16, i16* %a, i32 3 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.b.2 = load i16, i16* %addr.b.2 - %ld.a.3 = load i16, i16* %addr.a.3 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.b.2 = sext i16 %ld.b.2 to i32 - %sext.a.3 = sext i16 %ld.a.3 to i32 - %mul.2 = mul i32 %sext.a.2, %sext.b.1 - %mul.3 = mul i32 %sext.a.3, %sext.b.2 - %add = add i32 %mul.0, %mul.1 - %add.1 = add i32 %mul.2, %mul.3 - %add.2 = add i32 %add.1, %add - %res = add i32 %add.2, %acc - ret i32 %res -} - -; CHECK-LABEL: overlap_4 -; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* -; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) -define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { -entry: - %addr.a.1 = getelementptr i16, i16* %a, i32 1 - %addr.b.1 = getelementptr i16, i16* %b, i32 1 - %ld.a.0 = load i16, i16* %a - %sext.a.0 = sext i16 %ld.a.0 to i32 - %ld.b.0 = load i16, i16* %b - %ld.a.1 = load i16, i16* %addr.a.1 - %ld.b.1 = load i16, i16* %addr.b.1 - %sext.a.1 = sext i16 %ld.a.1 to i32 - %sext.b.1 = sext i16 %ld.b.1 to i32 - %sext.b.0 = sext i16 %ld.b.0 to i32 - %mul.0 = mul i32 %sext.a.0, %sext.b.0 - %mul.1 = mul i32 %sext.a.1, %sext.b.1 - %addr.a.2 = getelementptr i16, i16* %a, i32 2 - %addr.b.2 = getelementptr i16, i16* %b, i32 2 - %addr.a.3 = getelementptr i16, i16* %a, i32 3 - %ld.a.2 = load i16, i16* %addr.a.2 - %ld.b.2 = load i16, i16* %addr.b.2 - %ld.a.3 = load i16, i16* %addr.a.3 - %sext.a.2 = sext i16 %ld.a.2 to i32 - %sext.b.2 = sext i16 %ld.b.2 to i32 - %sext.a.3 = sext i16 %ld.a.3 to i32 - %mul.2 = mul i32 %sext.b.2, %sext.a.2 - %mul.3 = mul i32 %sext.b.1, %sext.a.3 - %add = add i32 %mul.0, %mul.1 - %add.1 = add i32 %mul.2, %mul.3 - %add.2 = add i32 %add.1, %add - %res = add i32 %add.2, %acc - ret i32 %res -} diff --git a/test/CodeGen/ARM/ParallelDSP/smlad12.ll b/test/CodeGen/ARM/ParallelDSP/smlad12.ll index 637fc3d3704..d4e09ca3fbb 100644 --- a/test/CodeGen/ARM/ParallelDSP/smlad12.ll +++ b/test/CodeGen/ARM/ParallelDSP/smlad12.ll @@ -2,7 +2,7 @@ ; ; The loop header is not the loop latch. ; -; CHECK: call i32 @llvm.arm.smlad +; CHECK-NOT: call i32 @llvm.arm.smlad ; define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: