FunctionPass *createA15SDOptimizerPass();
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMCodeGenPreparePass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
void initializeARMParallelDSPPass(PassRegistry &);
void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+void initializeARMCodeGenPreparePass(PassRegistry &);
void initializeARMConstantIslandsPass(PassRegistry &);
void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
--- /dev/null
+//===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass inserts intrinsics to handle small types that would otherwise be
+/// promoted during legalization. Here we can manually promote types or insert
+/// intrinsics which can handle narrow types that aren't supported by the
+/// register classes.
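+///
+/// For example (a rough sketch of what the pass aims to do), given a zeroext
+/// i8 argument %x:
+///
+///   %dec = add i8 %x, -1
+///   %cmp = icmp ugt i8 %dec, 1
+///
+/// the use-def chain can be rewritten to operate on i32 values:
+///
+///   %x32 = zext i8 %x to i32
+///   %dec = add i32 %x32, -1
+///   %cmp = icmp ugt i32 %dec, 1
+///
+/// so that instruction selection no longer needs a uxtb to make the i8
+/// semantics explicit before the compare.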
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+
+#define DEBUG_TYPE "arm-codegenprepare"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(false),
+ cl::desc("Disable ARM specific CodeGenPrepare pass"));
+
+static cl::opt<bool>
+EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false),
+ cl::desc("Use DSP instructions for scalar operations"));
+
+static cl::opt<bool>
+EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
+ cl::desc("Use DSP instructions for scalar operations\
+ with immediate operands"));
+
+namespace {
+
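+// Promotes the values in a use-def chain from a narrow (i8 or i16) type to
+// i32, inserting zexts for the leaves, truncs for the roots, and DSP
+// intrinsics where an instruction's wrapping behaviour must be preserved.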
+class IRPromoter {
+ SmallPtrSet<Value*, 8> NewInsts;
+ SmallVector<Instruction*, 4> InstsToRemove;
+ Module *M = nullptr;
+ LLVMContext &Ctx;
+
+public:
+ IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+
+ void Cleanup() {
+ for (auto *I : InstsToRemove) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
+ I->dropAllReferences();
+ I->eraseFromParent();
+ }
+ InstsToRemove.clear();
+ NewInsts.clear();
+ }
+
+ void Mutate(Type *OrigTy,
+ SmallPtrSetImpl<Value*> &Visited,
+ SmallPtrSetImpl<Value*> &Leaves,
+ SmallPtrSetImpl<Instruction*> &Roots);
+};
+
+class ARMCodeGenPrepare : public FunctionPass {
+ const ARMSubtarget *ST = nullptr;
+ IRPromoter *Promoter = nullptr;
+ std::set<Value*> AllVisited;
+ Type *OrigTy = nullptr;
+ unsigned TypeSize = 0;
+
+ bool isNarrowInstSupported(Instruction *I);
+ bool isSupportedValue(Value *V);
+ bool isLegalToPromote(Value *V);
+ bool TryToPromote(Value *V);
+
+public:
+ static char ID;
+
+ ARMCodeGenPrepare() : FunctionPass(ID) {}
+
+ ~ARMCodeGenPrepare() { delete Promoter; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
+
+ StringRef getPassName() const override { return "ARM IR optimizations"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+};
+
+}
+
+/// Can the given value generate sign bits?
+static bool isSigned(Value *V) {
+ if (!isa<Instruction>(V))
+ return false;
+
+ unsigned Opc = cast<Instruction>(V)->getOpcode();
+ return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
+ Opc == Instruction::SRem;
+}
+
+/// Some instructions can use 8- and 16-bit operands, and we don't need to
+/// promote anything larger. We disallow booleans to make life easier when
+/// dealing with icmps but allow any other integer that is <= 16 bits. Void
+/// types are accepted so we can handle switches.
+static bool isSupportedType(Value *V) {
+ if (V->getType()->isVoidTy())
+ return true;
+
+ const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType());
+ if (!IntTy)
+ return false;
+
+ // Don't try to promote boolean values.
+ if (IntTy->getBitWidth() == 1)
+ return false;
+
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ return isSupportedType(ZExt->getOperand(0));
+
+ return IntTy->getBitWidth() <= 16;
+}
+
+/// Return true if V will require any promoted values to be truncated for the
+/// use to be valid.
+static bool isSink(Value *V) {
+ auto UsesNarrowValue = [](Value *V) {
+ return V->getType()->getScalarSizeInBits() <= 32;
+ };
+
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return UsesNarrowValue(Store->getValueOperand());
+ if (auto *Return = dyn_cast<ReturnInst>(V))
+ return UsesNarrowValue(Return->getReturnValue());
+
+ return isa<CallInst>(V);
+}
+
+/// Return true if the given value is a leaf that will need to be zext'd.
+static bool isSource(Value *V) {
+ if (isa<Argument>(V) && isSupportedType(V))
+ return true;
+ else if (isa<TruncInst>(V))
+ return true;
+ else if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ // ZExt can be a leaf if it's the only user of a load.
+ return isa<LoadInst>(ZExt->getOperand(0)) &&
+ ZExt->getOperand(0)->hasOneUse();
+ else if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Load = dyn_cast<LoadInst>(V)) {
+ if (!isa<IntegerType>(Load->getType()))
+ return false;
+ // A load is a leaf, unless it's already just being zext'd.
+ if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin()))
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
+/// Return whether the instruction can be promoted without any modifications
+/// to its operands or result.
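+///
+/// For example, in the (sketched) IR below the i8 subtraction may wrap, but
+/// because the value is only decreasing and is only consumed by an unsigned
+/// compare, performing the same operations on i32 values gives the same
+/// compare result:
+///
+///   %sub = sub i8 %x, 1
+///   %cmp = icmp ugt i8 %sub, 1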
+static bool isSafeOverflow(Instruction *I) {
+ if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
+ return true;
+
+ unsigned Opc = I->getOpcode();
+ if (Opc == Instruction::Add || Opc == Instruction::Sub) {
+ // We don't care if the add or sub could wrap if the value is decreasing
+ // and is only being used by an unsigned compare.
+ if (!I->hasOneUse() ||
+ !isa<ICmpInst>(*I->user_begin()) ||
+ !isa<ConstantInt>(I->getOperand(1)))
+ return false;
+
+ auto *CI = cast<ICmpInst>(*I->user_begin());
+ if (CI->isSigned())
+ return false;
+
+ bool NegImm = cast<ConstantInt>(I->getOperand(1))->isNegative();
+ bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
+ ((Opc == Instruction::Add) && NegImm);
+ if (!IsDecreasing)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
+ return true;
+ }
+
+ // Otherwise, if an instruction is using a negative immediate we will need
+ // to fix it up during the promotion.
+ for (auto &Op : I->operands()) {
+ if (auto *Const = dyn_cast<ConstantInt>(Op))
+ if (Const->isNegative())
+ return false;
+ }
+ return false;
+}
+
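+/// Return true if V is an instruction with an integer result whose type
+/// should be mutated to i32. Stores, terminators, truncs and icmps are
+/// excluded, as are zexts that already produce an i32.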
+static bool shouldPromote(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<IntegerType>(V->getType()))
+ return false;
+
+ if (isa<StoreInst>(I) || isa<TerminatorInst>(I) || isa<TruncInst>(I) ||
+ isa<ICmpInst>(I))
+ return false;
+
+ if (auto *ZExt = dyn_cast<ZExtInst>(I))
+ return !ZExt->getDestTy()->isIntegerTy(32);
+
+ return true;
+}
+
+/// Return whether we can safely mutate V's type to ExtTy without having to be
+/// concerned with zero extending or truncation.
+static bool isPromotedResultSafe(Value *V) {
+ if (!isa<Instruction>(V))
+ return true;
+
+ if (isSigned(V))
+ return false;
+
+ // If I is only being used by something that will require its value to be
+ // truncated, then we don't care about the promoted result.
+ auto *I = cast<Instruction>(V);
+ if (I->hasOneUse() && isSink(*I->use_begin()))
+ return true;
+
+ if (isa<OverflowingBinaryOperator>(I))
+ return isSafeOverflow(I);
+ return true;
+}
+
+/// Return the intrinsic for the instruction that can perform the same
+/// operation but on a narrow type. This uses the parallel DSP intrinsics
+/// on scalar values.
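+///
+/// For example (a sketch), once the surrounding chain has been promoted to
+/// i32, an i8 add whose wrapping behaviour must be preserved can be replaced
+/// with:
+///
+///   %res = call i32 @llvm.arm.uadd8(i32 %a, i32 %b)
+///
+/// The intrinsic performs lane-wise 8-bit additions, so the low byte of the
+/// result wraps exactly as the original i8 add would.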
+static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
+ // Whether we use the signed or unsigned versions of these intrinsics
+ // doesn't matter because we're not using the GE bits that they set in
+ // the APSR.
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Add:
+ return TypeSize == 16 ? Intrinsic::arm_uadd16 :
+ Intrinsic::arm_uadd8;
+ case Instruction::Sub:
+ return TypeSize == 16 ? Intrinsic::arm_usub16 :
+ Intrinsic::arm_usub8;
+ }
+ llvm_unreachable("unhandled opcode for narrow intrinsic");
+}
+
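+// Promote the collected use-def chains to i32: insert zexts for the leaves,
+// mutate the types of the intermediate instructions (fixing up constant
+// operands and replacing overflow-sensitive instructions with DSP
+// intrinsics), remove zexts that have become redundant, and finally insert
+// truncs where the roots (stores, returns and calls) still need the narrow
+// type.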
+void IRPromoter::Mutate(Type *OrigTy,
+ SmallPtrSetImpl<Value*> &Visited,
+ SmallPtrSetImpl<Value*> &Leaves,
+ SmallPtrSetImpl<Instruction*> &Roots) {
+ IRBuilder<> Builder{Ctx};
+ Type *ExtTy = Type::getInt32Ty(M->getContext());
+ unsigned TypeSize = OrigTy->getPrimitiveSizeInBits();
+ SmallPtrSet<Value*, 8> Promoted;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize
+ << " to 32-bits\n");
+
+ auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
+ SmallVector<Instruction*, 4> Users;
+ Instruction *InstTo = dyn_cast<Instruction>(To);
+ for (Use &U : From->uses()) {
+ auto *User = cast<Instruction>(U.getUser());
+ if (InstTo && User->isIdenticalTo(InstTo))
+ continue;
+ Users.push_back(User);
+ }
+
+ for (auto &U : Users)
+ U->replaceUsesOfWith(From, To);
+ };
+
+ auto FixConst = [&](ConstantInt *Const, Instruction *I) {
+ Constant *NewConst = nullptr;
+ if (isSafeOverflow(I)) {
+ NewConst = (Const->isNegative()) ?
+ ConstantExpr::getSExt(Const, ExtTy) :
+ ConstantExpr::getZExt(Const, ExtTy);
+ } else {
+ uint64_t NewVal = *Const->getValue().getRawData();
+ if (Const->getType() == Type::getInt16Ty(Ctx))
+ NewVal &= 0xFFFF;
+ else
+ NewVal &= 0xFF;
+ NewConst = ConstantInt::get(ExtTy, NewVal);
+ }
+ I->replaceUsesOfWith(Const, NewConst);
+ };
+
+ auto InsertDSPIntrinsic = [&](Instruction *I) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+ << *I << "\n");
+ Function *DSPInst =
+ Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize));
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+ CallInst *Call = Builder.CreateCall(DSPInst, Args);
+ ReplaceAllUsersOfWith(I, Call);
+ InstsToRemove.push_back(I);
+ NewInsts.insert(Call);
+ };
+
+ auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
+ Builder.SetInsertPoint(InsertPt);
+ if (auto *I = dyn_cast<Instruction>(V))
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
+ if (isa<Argument>(V))
+ ZExt->moveBefore(InsertPt);
+ else
+ ZExt->moveAfter(InsertPt);
+ ReplaceAllUsersOfWith(V, ZExt);
+ NewInsts.insert(ZExt);
+ };
+
+ // First, insert extending instructions between the leaves and their users.
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n");
+ for (auto V : Leaves) {
+ LLVM_DEBUG(dbgs() << " - " << *V << "\n");
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ ZExt->mutateType(ExtTy);
+ else if (auto *I = dyn_cast<Instruction>(V))
+ InsertZExt(I, I);
+ else if (auto *Arg = dyn_cast<Argument>(V)) {
+ BasicBlock &BB = Arg->getParent()->front();
+ InsertZExt(Arg, &*BB.getFirstInsertionPt());
+ } else {
+ llvm_unreachable("unhandled leaf that needs extending");
+ }
+ Promoted.insert(V);
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
+ // Then mutate the types of the instructions within the tree. Here we handle
+ // constant operands.
+ for (auto *V : Visited) {
+ if (Leaves.count(V))
+ continue;
+
+ if (!isa<Instruction>(V))
+ continue;
+
+ auto *I = cast<Instruction>(V);
+ if (Roots.count(I))
+ continue;
+
+ for (auto &U : I->operands()) {
+ if ((U->getType() == ExtTy) || !isSupportedType(&*U))
+ continue;
+
+ if (auto *Const = dyn_cast<ConstantInt>(&*U))
+ FixConst(Const, I);
+ else if (isa<UndefValue>(&*U))
+ U->mutateType(ExtTy);
+ }
+
+ if (shouldPromote(I)) {
+ I->mutateType(ExtTy);
+ Promoted.insert(I);
+ }
+ }
+
+ // Now we need to remove any zexts that have become unnecessary, as well
+ // as insert any intrinsics.
+ for (auto *V : Visited) {
+ if (Leaves.count(V))
+ continue;
+ if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+ if (ZExt->getDestTy() != ExtTy) {
+ ZExt->mutateType(ExtTy);
+ Promoted.insert(ZExt);
+ }
+ else if (ZExt->getSrcTy() == ExtTy) {
+ ReplaceAllUsersOfWith(V, ZExt->getOperand(0));
+ InstsToRemove.push_back(ZExt);
+ }
+ continue;
+ }
+
+ if (!shouldPromote(V) || isPromotedResultSafe(V))
+ continue;
+
+ // Replace unsafe instructions with appropriate intrinsic calls.
+ InsertDSPIntrinsic(cast<Instruction>(V));
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the roots:\n");
+ // Fix up any stores or returns that use the results of the promoted
+ // chain.
+ for (auto I : Roots) {
+ LLVM_DEBUG(dbgs() << " - " << *I << "\n");
+ Type *TruncTy = OrigTy;
+ if (auto *Store = dyn_cast<StoreInst>(I)) {
+ auto *PtrTy = cast<PointerType>(Store->getPointerOperandType());
+ TruncTy = PtrTy->getElementType();
+ } else if (isa<ReturnInst>(I)) {
+ Function *F = I->getParent()->getParent();
+ TruncTy = F->getFunctionType()->getReturnType();
+ }
+
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ Value *V = I->getOperand(i);
+ if (Promoted.count(V) || NewInsts.count(V)) {
+ if (auto *Op = dyn_cast<Instruction>(V)) {
+
+ if (auto *Call = dyn_cast<CallInst>(I))
+ TruncTy = Call->getFunctionType()->getParamType(i);
+
+ if (TruncTy == ExtTy)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy
+ << " Trunc for " << *Op << "\n");
+ Builder.SetInsertPoint(Op);
+ auto *Trunc = cast<Instruction>(Builder.CreateTrunc(Op, TruncTy));
+ Trunc->moveBefore(I);
+ I->setOperand(i, Trunc);
+ NewInsts.insert(Trunc);
+ }
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n");
+}
+
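+// Return true if the narrow DSP intrinsics can and should be used to replace
+// I: this requires DSP support and the scalar DSP option, excludes
+// Thumb1-only targets, and only handles adds and subs, with immediate
+// operands only when explicitly enabled.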
+bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
+ if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
+ return false;
+
+ if (ST->isThumb() && !ST->hasThumb2())
+ return false;
+
+ if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
+ return false;
+
+ // TODO
+ // Would it be profitable? For Thumb code, these parallel DSP instructions
+ // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
+ // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
+ // halved. They also do not take immediates as operands.
+ for (auto &Op : I->operands()) {
+ if (isa<Constant>(Op)) {
+ if (!EnableDSPWithImms)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// We accept most instructions, as well as Arguments and ConstantInts. We
+/// disallow casts other than zext and trunc, and only allow calls if their
+/// return value is zeroext. We don't allow opcodes that can introduce sign
+/// bits.
+bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n");
+
+ // Non-instruction values that we can handle.
+ if (isa<ConstantInt>(V) || isa<Argument>(V))
+ return true;
+
+ // Memory instructions
+ if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<GetElementPtrInst>(V))
+ return true;
+
+ // Branches and targets.
+ if (auto *ICmp = dyn_cast<ICmpInst>(V))
+ return ICmp->isEquality() || !ICmp->isSigned();
+
+ if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
+ return true;
+
+ if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V))
+ return true;
+
+ // Special cases for calls as we need to check for zeroext
+ // TODO We should accept calls even if they don't have zeroext, as they can
+ // still be roots.
+ if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Cast = dyn_cast<CastInst>(V)) {
+ if (isa<ZExtInst>(Cast))
+ return Cast->getDestTy()->getScalarSizeInBits() <= 32;
+ else if (auto *Trunc = dyn_cast<TruncInst>(V))
+ return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize;
+ else {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n");
+ return false;
+ }
+ } else if (!isa<BinaryOperator>(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n");
+ return false;
+ }
+
+ bool res = !isSigned(V);
+ if (!res)
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, it's a signed instruction.\n");
+ return res;
+}
+
+/// Check that the type of V is one that would be promoted, that its width is
+/// no larger than our base 'TypeSize' type (the width of the chain being
+/// promoted), and that the promoted result is either safe or can be produced
+/// with a narrow DSP intrinsic.
+bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
+ if (!isSupportedType(V))
+ return false;
+
+ unsigned VSize = 0;
+ if (auto *Ld = dyn_cast<LoadInst>(V)) {
+ auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType());
+ VSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
+ } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+ VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ } else {
+ VSize = V->getType()->getPrimitiveSizeInBits();
+ }
+
+ if (VSize > TypeSize)
+ return false;
+
+ if (isPromotedResultSafe(V))
+ return true;
+
+ if (auto *I = dyn_cast<Instruction>(V))
+ return isNarrowInstSupported(I);
+
+ return false;
+}
+
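+// Collect the use-def chains reachable from V, classifying values as leaves
+// (sources that will be zext'd) or roots (sinks that will need truncs), and
+// promote the whole chain to i32 when every value is supported and legal to
+// promote and the estimated benefit outweighs the cost of the extra extends
+// and truncs.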
+bool ARMCodeGenPrepare::TryToPromote(Value *V) {
+ OrigTy = V->getType();
+ TypeSize = OrigTy->getPrimitiveSizeInBits();
+
+ if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n");
+
+ SetVector<Value*> WorkList;
+ SmallPtrSet<Value*, 8> Leaves;
+ SmallPtrSet<Instruction*, 4> Roots;
+ WorkList.insert(V);
+ SmallPtrSet<Value*, 16> CurrentVisited;
+ CurrentVisited.clear();
+
+ // Return true if the given value can be, or has already been, visited. Add
+ // V to the worklist if needed.
+ auto AddLegalInst = [&](Value *V) {
+ if (CurrentVisited.count(V))
+ return true;
+
+ if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
+ return false;
+ }
+
+ WorkList.insert(V);
+ return true;
+ };
+
+ // Iterate through, and add to, a tree of operands and users in the use-def
+ // chain.
+ while (!WorkList.empty()) {
+ Value *V = WorkList.back();
+ WorkList.pop_back();
+ if (CurrentVisited.count(V))
+ continue;
+
+ if (!isa<Instruction>(V) && !isSource(V))
+ continue;
+
+ // If we've already visited this value from somewhere, bail now because
+ // the tree has already been explored.
+ // TODO: This could limit the transform, i.e. if we try to promote something
+ // from an i8 and fail first, before trying an i16.
+ if (AllVisited.count(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Already visited this: " << *V << "\n");
+ return false;
+ }
+
+ CurrentVisited.insert(V);
+ AllVisited.insert(V);
+
+ // Calls can be both sources and sinks.
+ if (isSink(V))
+ Roots.insert(cast<Instruction>(V));
+ if (isSource(V))
+ Leaves.insert(V);
+ else if (auto *I = dyn_cast<Instruction>(V)) {
+ // Visit operands of any instruction visited.
+ for (auto &U : I->operands()) {
+ if (!AddLegalInst(U))
+ return false;
+ }
+ }
+
+ // Don't visit users of a node which isn't going to be mutated unless it's a
+ // source.
+ if (isSource(V) || shouldPromote(V)) {
+ for (Use &U : V->uses()) {
+ if (!AddLegalInst(U.getUser()))
+ return false;
+ }
+ }
+ }
+
+ unsigned NumToPromote = 0;
+ unsigned Cost = 0;
+ for (auto *V : CurrentVisited) {
+ // Truncs will cause a uxt, and non-zeroext arguments will often require
+ // a uxt somewhere.
+ if (isa<TruncInst>(V))
+ ++Cost;
+ else if (auto *Arg = dyn_cast<Argument>(V)) {
+ if (!Arg->hasZExtAttr())
+ ++Cost;
+ }
+
+ // Mem ops can automatically be extended/truncated and non-instructions
+ // don't need anything done.
+ if (Leaves.count(V) || isa<StoreInst>(V) || !isa<Instruction>(V))
+ continue;
+
+ // Will need to truncate call args and returns.
+ if (Roots.count(cast<Instruction>(V))) {
+ ++Cost;
+ continue;
+ }
+
+ if (shouldPromote(V))
+ ++NumToPromote;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
+ for (auto *I : CurrentVisited)
+ I->dump();
+ );
+ LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote
+ << " instructions = " << Cost << "\n");
+ if (Cost > NumToPromote || (NumToPromote == 0))
+ return false;
+
+ Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots);
+ return true;
+}
+
+bool ARMCodeGenPrepare::doInitialization(Module &M) {
+ Promoter = new IRPromoter(&M);
+ return false;
+}
+
+bool ARMCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F) || DisableCGP)
+ return false;
+
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ ST = &TM.getSubtarget<ARMSubtarget>(F);
+ bool MadeChange = false;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n");
+
+ // Search up from icmps to try to promote their operands.
+ for (BasicBlock &BB : F) {
+ auto &Insts = BB.getInstList();
+ for (auto &I : Insts) {
+ if (AllVisited.count(&I))
+ continue;
+
+ if (isa<ICmpInst>(I)) {
+ auto &CI = cast<ICmpInst>(I);
+
+ // Skip signed or pointer compares
+ if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
+ for (auto &Op : CI.operands()) {
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ if (isa<ZExtInst>(I))
+ MadeChange |= TryToPromote(I->getOperand(0));
+ else
+ MadeChange |= TryToPromote(I);
+ }
+ }
+ }
+ }
+ Promoter->Cleanup();
+ LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
+ dbgs();
+ report_fatal_error("Broken function after type promotion");
+ });
+ }
+ if (MadeChange)
+ LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n");
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE,
+ "ARM IR optimizations", false, false)
+INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
+ false, false)
+
+char ARMCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createARMCodeGenPreparePass() {
+ return new ARMCodeGenPrepare();
+}
initializeARMLoadStoreOptPass(Registry);
initializeARMPreAllocLoadStoreOptPass(Registry);
initializeARMParallelDSPPass(Registry);
+ initializeARMCodeGenPreparePass(Registry);
initializeARMConstantIslandsPass(Registry);
initializeARMExecutionDomainFixPass(Registry);
initializeARMExpandPseudoPass(Registry);
}
void addIRPasses() override;
+ void addCodeGenPrepare() override;
bool addPreISel() override;
bool addInstSelector() override;
bool addIRTranslator() override;
addPass(createInterleavedAccessPass());
}
+void ARMPassConfig::addCodeGenPrepare() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createARMCodeGenPreparePass());
+ TargetPassConfig::addCodeGenPrepare();
+}
+
bool ARMPassConfig::addPreISel() {
if (getOptLevel() != CodeGenOpt::None)
addPass(createARMParallelDSPPass());
ARMBaseInstrInfo.cpp
ARMBaseRegisterInfo.cpp
ARMCallLowering.cpp
+ ARMCodeGenPrepare.cpp
ARMConstantIslandPass.cpp
ARMConstantPoolValue.cpp
ARMExpandPseudoInsts.cpp
--- /dev/null
+; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv7em %s -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8 %s -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
+
+; CHECK-COMMON-LABEL: test_ult_254_inc_imm:
+; CHECK-DSP: adds r0, #1
+; CHECK-DSP-NEXT: uxtb r1, r0
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #254
+; CHECK-DSP-NEXT: it lo
+; CHECK-DSP-NEXT: movlo r0, #35
+
+; CHECK-DSP-IMM: movs r1, #1
+; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
+; CHECK-DSP-IMM-NEXT: movs r0, #47
+; CHECK-DSP-IMM-NEXT: cmp r1, #254
+; CHECK-DSP-IMM-NEXT: it lo
+; CHECK-DSP-IMM-NEXT: movlo r0, #35
+define i32 @test_ult_254_inc_imm(i8 zeroext %x) {
+entry:
+ %add = add i8 %x, 1
+ %cmp = icmp ult i8 %add, 254
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_slt_254_inc_imm
+; CHECK-COMMON: adds
+; CHECK-COMMON: sxtb
+define i32 @test_slt_254_inc_imm(i8 signext %x) {
+entry:
+ %add = add i8 %x, 1
+ %cmp = icmp slt i8 %add, 254
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_ult_254_inc_var:
+; CHECK-NODSP: add r0, r1
+; CHECK-NODSP-NEXT: uxtb r1, r0
+; CHECK-NODSP-NEXT: movs r0, #47
+; CHECK-NODSP-NEXT: cmp r1, #254
+; CHECK-NODSP-NEXT: it lo
+; CHECK-NODSP-NEXT: movlo r0, #35
+
+; CHECK-DSP: uadd8 r1, r0, r1
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #254
+; CHECK-DSP-NEXT: it lo
+; CHECK-DSP-NEXT: movlo r0, #35
+define i32 @test_ult_254_inc_var(i8 zeroext %x, i8 zeroext %y) {
+entry:
+ %add = add i8 %x, %y
+ %cmp = icmp ult i8 %add, 254
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_sle_254_inc_var
+; CHECK-COMMON: add
+; CHECK-COMMON: sxtb
+; CHECK-COMMON: cmp
+define i32 @test_sle_254_inc_var(i8 %x, i8 %y) {
+entry:
+ %add = add i8 %x, %y
+ %cmp = icmp sle i8 %add, 254
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_ugt_1_dec_imm:
+; CHECK-COMMON: subs r1, r0, #1
+; CHECK-COMMON-NEXT: movs r0, #47
+; CHECK-COMMON-NEXT: cmp r1, #1
+; CHECK-COMMON-NEXT: it hi
+; CHECK-COMMON-NEXT: movhi r0, #35
+define i32 @test_ugt_1_dec_imm(i8 zeroext %x) {
+entry:
+ %add = add i8 %x, -1
+ %cmp = icmp ugt i8 %add, 1
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_sgt_1_dec_imm
+; CHECK-COMMON: subs
+; CHECK-COMMON: sxtb
+; CHECK-COMMON: cmp
+define i32 @test_sgt_1_dec_imm(i8 %x) {
+entry:
+ %add = add i8 %x, -1
+ %cmp = icmp sgt i8 %add, 1
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_ugt_1_dec_var:
+; CHECK-NODSP: subs r0, r0, r1
+; CHECK-NODSP-NEXT: uxtb r1, r0
+; CHECK-NODSP-NEXT: movs r0, #47
+; CHECK-NODSP-NEXT: cmp r1, #1
+; CHECK-NODSP-NEXT: it hi
+; CHECK-NODSP-NEXT: movhi r0, #35
+
+; CHECK-DSP: usub8 r1, r0, r1
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #1
+; CHECK-DSP-NEXT: it hi
+; CHECK-DSP-NEXT: movhi r0, #35
+define i32 @test_ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
+entry:
+ %sub = sub i8 %x, %y
+ %cmp = icmp ugt i8 %sub, 1
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_sge_1_dec_var
+; CHECK-COMMON: sub
+; CHECK-COMMON: sxtb
+; CHECK-COMMON: cmp
+define i32 @test_sge_1_dec_var(i8 %x, i8 %y) {
+entry:
+ %sub = sub i8 %x, %y
+ %cmp = icmp sge i8 %sub, 1
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: dsp_imm1:
+; CHECK-DSP: eors r1, r0
+; CHECK-DSP-NEXT: and r0, r0, #7
+; CHECK-DSP-NEXT: subs r0, r0, r1
+; CHECK-DSP-NEXT: adds r0, #1
+; CHECK-DSP-NEXT: uxtb r1, r0
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #254
+; CHECK-DSP-NEXT: it lo
+; CHECK-DSP-NEXT: movlo r0, #35
+
+; CHECK-DSP-IMM: eors r1, r0
+; CHECK-DSP-IMM-NEXT: and r0, r0, #7
+; CHECK-DSP-IMM-NEXT: usub8 r0, r0, r1
+; CHECK-DSP-IMM-NEXT: movs r1, #1
+; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
+; CHECK-DSP-IMM-NEXT: movs r0, #47
+; CHECK-DSP-IMM-NEXT: cmp r1, #254
+; CHECK-DSP-IMM-NEXT: it lo
+; CHECK-DSP-IMM-NEXT: movlo r0, #35
+define i32 @dsp_imm1(i8 zeroext %x, i8 zeroext %y) {
+entry:
+ %xor = xor i8 %x, %y
+ %and = and i8 %x, 7
+ %sub = sub i8 %and, %xor
+ %add = add i8 %sub, 1
+ %cmp = icmp ult i8 %add, 254
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: dsp_imm2
+; CHECK-COMMON: add r0, r1
+; CHECK-DSP-NEXT: ldrh r1, [r3]
+; CHECK-DSP-NEXT: ldrh r2, [r2]
+; CHECK-DSP-NEXT: subs r1, r1, r0
+; CHECK-DSP-NEXT: add r0, r2
+; CHECK-DSP-NEXT: uxth r3, r1
+; CHECK-DSP-NEXT: uxth r2, r0
+; CHECK-DSP-NEXT: cmp r2, r3
+
+; CHECK-DSP-IMM: movs r1, #0
+; CHECK-DSP-IMM-NEXT: uxth r0, r0
+; CHECK-DSP-IMM-NEXT: usub16 r1, r1, r0
+; CHECK-DSP-IMM-NEXT: ldrh r0, [r2]
+; CHECK-DSP-IMM-NEXT: ldrh r3, [r3]
+; CHECK-DSP-IMM-NEXT: usub16 r0, r0, r1
+; CHECK-DSP-IMM-NEXT: uadd16 r1, r3, r1
+; CHECK-DSP-IMM-NEXT: cmp r0, r1
+
+define i16 @dsp_imm2(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) {
+entry:
+ %add0 = add i32 %arg0, %arg1
+ %conv0 = trunc i32 %add0 to i16
+ %sub0 = sub i16 0, %conv0
+ %load0 = load i16, i16* %gep0, align 2
+ %load1 = load i16, i16* %gep1, align 2
+ %sub1 = sub i16 %load0, %sub0
+ %add1 = add i16 %load1, %sub0
+ %cmp = icmp ult i16 %sub1, %add1
+ %res = select i1 %cmp, i16 %add1, i16 %sub1
+ ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: dsp_var:
+; CHECK-COMMON: eors r1, r0
+; CHECK-COMMON: and r2, r0, #7
+; CHECK-NODSP: subs r1, r2, r1
+; CHECK-NODSP: add.w r0, r1, r0, lsl #1
+; CHECK-NODSP: uxtb r1, r0
+; CHECK-DSP: usub8 r1, r2, r1
+; CHECK-DSP: lsls r0, r0, #1
+; CHECK-DSP: uadd8 r1, r1, r0
+; CHECK-DSP-NOT: uxt
+; CHECK-COMMON: movs r0, #47
+; CHECK-COMMON: cmp r1, #254
+; CHECK-COMMON: it lo
+; CHECK-COMMON: movlo r0, #35
+define i32 @dsp_var(i8 zeroext %x, i8 zeroext %y) {
+ %xor = xor i8 %x, %y
+ %and = and i8 %x, 7
+ %sub = sub i8 %and, %xor
+ %mul = shl nuw i8 %x, 1
+ %add = add i8 %sub, %mul
+ %cmp = icmp ult i8 %add, 254
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: store_dsp_res
+; CHECK-DSP: usub8
+; CHECK-DSP: strb
+define void @store_dsp_res(i8* %in, i8* %out, i8 %compare) {
+ %first = getelementptr inbounds i8, i8* %in, i32 0
+ %second = getelementptr inbounds i8, i8* %in, i32 1
+ %ld0 = load i8, i8* %first
+ %ld1 = load i8, i8* %second
+ %xor = xor i8 %ld0, -1
+ %cmp = icmp ult i8 %compare, %ld1
+ %select = select i1 %cmp, i8 %compare, i8 %xor
+ %sub = sub i8 %ld0, %select
+ store i8 %sub, i8* %out, align 1
+ ret void
+}
+
+; CHECK-COMMON-LABEL: ugt_1_dec_imm:
+; CHECK-COMMON: subs r1, r0, #1
+; CHECK-COMMON-NEXT: movs r0, #47
+; CHECK-COMMON-NEXT: cmp r1, #1
+; CHECK-COMMON-NEXT: it hi
+; CHECK-COMMON-NEXT: movhi r0, #35
+define i32 @ugt_1_dec_imm(i8 zeroext %x) {
+entry:
+ %add = add i8 %x, -1
+ %cmp = icmp ugt i8 %add, 1
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: ugt_1_dec_var:
+; CHECK-NODSP: subs r0, r0, r1
+; CHECK-NODSP-NEXT: uxtb r1, r0
+; CHECK-NODSP-NEXT: movs r0, #47
+; CHECK-NODSP-NEXT: cmp r1, #1
+; CHECK-NODSP-NEXT: it hi
+; CHECK-NODSP-NEXT: movhi r0, #35
+
+; CHECK-DSP: usub8 r1, r0, r1
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #1
+; CHECK-DSP-NEXT: it hi
+; CHECK-DSP-NEXT: movhi r0, #35
+define i32 @ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
+entry:
+ %sub = sub i8 %x, %y
+ %cmp = icmp ugt i8 %sub, 1
+ %res = select i1 %cmp, i32 35, i32 47
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: icmp_i32_zext:
+; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r0]
+; CHECK-COMMON: subs [[SUB:r[^ ]+]], [[LD]], #1
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp [[LD]], [[SUB]]
+; CHECK-COMMON-NOT: uxt
+define i8 @icmp_i32_zext(i8* %ptr) {
+entry:
+ %gep = getelementptr inbounds i8, i8* %ptr, i32 0
+ %0 = load i8, i8* %gep, align 1
+ %1 = sub nuw nsw i8 %0, 1
+ %conv44 = zext i8 %0 to i32
+ br label %preheader
+
+preheader:
+ br label %body
+
+body:
+ %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ]
+ %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ]
+ %conv51266 = zext i8 %2 to i32
+ %cmp52267 = icmp eq i32 %si.0274, %conv51266
+ br i1 %cmp52267, label %if.end, label %exit
+
+if.end:
+ %inc = add i32 %si.0274, 1
+ %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc
+ %3 = load i8, i8* %gep1, align 1
+ br label %body
+
+exit:
+ ret i8 %2
+}
+
+@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
+@sh1 = hidden local_unnamed_addr global i16 0, align 2
+@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
+
+; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16
+; CHECK-NODSP: ldrb [[BYTE:r[^ ]+]],
+; CHECK-NODSP: strh [[BYTE]],
+; CHECK-NODSP: ldrsh.w
+define i32 @icmp_sext_zext_store_i8_i16() {
+entry:
+ %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1
+ %conv = zext i8 %0 to i16
+ store i16 %conv, i16* @sh1, align 2
+ %conv1 = zext i8 %0 to i32
+ %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2
+ %conv2 = sext i16 %1 to i32
+ %cmp = icmp eq i32 %conv1, %conv2
+ %conv3 = zext i1 %cmp to i32
+ ret i32 %conv3
+}
+
+; CHECK-COMMON-LABEL: or_icmp_ugt:
+; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r1]
+; CHECK-COMMON: subs [[SUB:r[^ ]+]], #1
+; CHECK-COMMON-NOT: uxtb
+; CHECK-COMMON: cmp [[SUB]], #3
+define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) {
+entry:
+ %0 = load i8, i8* %ptr
+ %1 = zext i8 %0 to i32
+ %mul = shl nuw nsw i32 %1, 1
+ %add0 = add nuw nsw i32 %mul, 6
+ %cmp0 = icmp ne i32 %arg, %add0
+ %add1 = add i8 %0, -1
+ %cmp1 = icmp ugt i8 %add1, 3
+ %or = or i1 %cmp0, %cmp1
+ ret i1 %or
+}
+
+; CHECK-COMMON-LABEL: icmp_switch_trunc:
+; CHECK-COMMON-NOT: uxt
+define i16 @icmp_switch_trunc(i16 zeroext %arg) {
+entry:
+ %conv = add nuw i16 %arg, 15
+ %mul = mul nuw nsw i16 %conv, 3
+ %trunc = trunc i16 %arg to i3
+ switch i3 %trunc, label %default [
+ i3 0, label %sw.bb
+ i3 1, label %sw.bb.i
+ ]
+
+sw.bb:
+ %cmp0 = icmp ult i16 %mul, 127
+ %select = select i1 %cmp0, i16 %mul, i16 127
+ br label %exit
+
+sw.bb.i:
+ %cmp1 = icmp ugt i16 %mul, 34
+ %select.i = select i1 %cmp1, i16 %mul, i16 34
+ br label %exit
+
+default:
+ br label %exit
+
+exit:
+ %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
+ ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: icmp_eq_minus_one
+; CHECK-COMMON: cmp r0, #255
+define i32 @icmp_eq_minus_one(i8* %ptr) {
+ %load = load i8, i8* %ptr, align 1
+ %conv = zext i8 %load to i32
+ %cmp = icmp eq i8 %load, -1
+ %ret = select i1 %cmp, i32 %conv, i32 -1
+ ret i32 %ret
+}
+
+; CHECK-COMMON-LABEL: icmp_not
+; CHECK-COMMON: movw r2, #65535
+; CHECK-COMMON: eors r2, r0
+; CHECK-COMMON: movs r0, #32
+; CHECK-COMMON: cmp r2, r1
+define i32 @icmp_not(i16 zeroext %arg0, i16 zeroext %arg1) {
+ %not = xor i16 %arg0, -1
+ %cmp = icmp eq i16 %not, %arg1
+ %res = select i1 %cmp, i32 16, i32 32
+ ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: mul_wrap
+; CHECK-COMMON: mul
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+define i16 @mul_wrap(i16 %arg0, i16 %arg1) {
+ %mul = mul i16 %arg0, %arg1
+ %cmp = icmp eq i16 %mul, 1
+ %res = select i1 %cmp, i16 %arg0, i16 47
+ ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: shl_wrap
+; CHECK-COMMON: lsl
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+define i16 @shl_wrap(i16 %arg0) {
+ %mul = shl i16 %arg0, 4
+ %cmp = icmp eq i16 %mul, 1
+ %res = select i1 %cmp, i16 %arg0, i16 47
+ ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: add_wrap
+; CHECK-COMMON: add
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+define i16 @add_wrap(i16 %arg0, i16 %arg1) {
+ %add = add i16 %arg0, 128
+ %cmp = icmp eq i16 %add, %arg1
+ %res = select i1 %cmp, i16 %arg0, i16 1
+ ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: sub_wrap
+; CHECK-COMMON: sub
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+define i16 @sub_wrap(i16 %arg0, i16 %arg1, i16 %arg2) {
+ %sub = sub i16 %arg0, %arg2
+ %cmp = icmp eq i16 %sub, %arg1
+ %res = select i1 %cmp, i16 %arg0, i16 1
+ ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: urem_trunc_icmps
+; CHECK-COMMON-NOT: uxt
+define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) {
+entry:
+ %ptr = load i16*, i16** %in, align 4
+ %ld = load i16, i16* %ptr, align 2
+ %cmp.i = icmp eq i16 %ld, 0
+ br i1 %cmp.i, label %exit, label %cond.false.i
+
+cond.false.i:
+ %rem = urem i16 5, %ld
+ %extract.t = trunc i16 %rem to i8
+ br label %body
+
+body:
+ %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ]
+ %cmp = icmp ugt i8 %cond.in.i.off0, 7
+ %conv5 = zext i1 %cmp to i32
+ store i32 %conv5, i32* %g, align 4
+ %.pr = load i32, i32* %k, align 4
+ %tobool13150 = icmp eq i32 %.pr, 0
+ br i1 %tobool13150, label %for.inc, label %exit
+
+for.inc:
+ %add = add nuw i8 %cond.in.i.off0, 1
+ br label %body
+
+exit:
+ ret void
+}
--- /dev/null
+; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv8m.main -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv7em %s -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
+
+; Test that ARMCodeGenPrepare can handle:
+; - loops
+; - call operands
+; - call return values
+; - ret instructions
+; We use nuw on the arithmetic instructions to avoid complications.
+
+; Check that the arguments are extended but then nothing else is.
+; This also ensures that the pass can handle loops.
+; CHECK-COMMON-LABEL: phi_feeding_phi_args
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+; CHECK-COMMON-NOT: uxtb
+define void @phi_feeding_phi_args(i8 %a, i8 %b) {
+entry:
+ %0 = icmp ugt i8 %a, %b
+ br i1 %0, label %preheader, label %empty
+
+empty:
+ br label %preheader
+
+preheader:
+ %1 = phi i8 [ %a, %entry ], [ %b, %empty ]
+ br label %loop
+
+loop:
+ %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
+ %cmp = icmp ult i8 %val, 254
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %inc = sub nuw i8 %val, 2
+ br label %if.end
+
+if.else:
+ %inc1 = shl nuw i8 %val, 1
+ br label %if.end
+
+if.end:
+ %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
+ %cmp1 = icmp eq i8 %inc2, 255
+ br i1 %cmp1, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Same as above, but as the args are zeroext, we shouldn't see any uxts.
+; CHECK-COMMON-LABEL: phi_feeding_phi_zeroext_args
+; CHECK-COMMON-NOT: uxt
+define void @phi_feeding_phi_zeroext_args(i8 zeroext %a, i8 zeroext %b) {
+entry:
+ %0 = icmp ugt i8 %a, %b
+ br i1 %0, label %preheader, label %empty
+
+empty:
+ br label %preheader
+
+preheader:
+ %1 = phi i8 [ %a, %entry ], [ %b, %empty ]
+ br label %loop
+
+loop:
+ %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
+ %cmp = icmp ult i8 %val, 254
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %inc = sub nuw i8 %val, 2
+ br label %if.end
+
+if.else:
+ %inc1 = shl nuw i8 %val, 1
+ br label %if.end
+
+if.end:
+ %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
+ %cmp1 = icmp eq i8 %inc2, 255
+ br i1 %cmp1, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Just check that phis also work with i16s.
+; CHECK-COMMON-LABEL: phi_i16:
+; CHECK-COMMON-NOT: uxt
+define void @phi_i16() {
+entry:
+ br label %loop
+
+loop:
+ %val = phi i16 [ 0, %entry ], [ %inc2, %if.end ]
+ %cmp = icmp ult i16 %val, 128
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %inc = add nuw i16 %val, 2
+ br label %if.end
+
+if.else:
+ %inc1 = add nuw i16 %val, 1
+ br label %if.end
+
+if.end:
+ %inc2 = phi i16 [ %inc, %if.then], [ %inc1, %if.else ]
+ %cmp1 = icmp ult i16 %inc2, 253
+ br i1 %cmp1, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+; CHECK-COMMON-LABEL: phi_feeding_switch
+; CHECK-COMMON: ldrb
+; CHECK-COMMON: uxtb
+; CHECK-COMMON-NOT: uxt
+define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) {
+entry:
+ %pre = load i8, i8* %memblock, align 1
+ %conv = trunc i16 %arg to i8
+ br label %header
+
+header:
+ %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ]
+ %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ]
+ %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ]
+ switch i8 %phi.0, label %default [
+ i8 43, label %for.inc.i
+ i8 45, label %for.inc.i.i
+ ]
+
+for.inc.i:
+ %xor = xor i8 %phi.1, 1
+ br label %latch
+
+for.inc.i.i:
+ %and = and i8 %phi.1, 3
+ br label %latch
+
+default:
+ %sub = sub i8 %phi.0, 1
+ %cmp2 = icmp ugt i8 %sub, 4
+ br i1 %cmp2, label %latch, label %exit
+
+latch:
+ %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ]
+ %count = add nuw i8 %phi.2, 1
+ store i8 %count, i8* %store, align 1
+ br label %header
+
+exit:
+ ret void
+}
+
+; CHECK-COMMON-LABEL: ret_i8
+; CHECK-COMMON-NOT: uxt
+define i8 @ret_i8() {
+entry:
+ br label %loop
+
+loop:
+ %val = phi i8 [ 0, %entry ], [ %inc2, %if.end ]
+ %cmp = icmp ult i8 %val, 128
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %inc = add nuw i8 %val, 2
+ br label %if.end
+
+if.else:
+ %inc1 = add nuw i8 %val, 1
+ br label %if.end
+
+if.end:
+ %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
+ %cmp1 = icmp ult i8 %inc2, 253
+ br i1 %cmp1, label %exit, label %loop
+
+exit:
+ ret i8 %inc2
+}
+
+; Check that %exp requires a uxth in all cases, and that one will also be
+; required to promote %1 for the call - unless we can generate a uadd16.
+; CHECK-COMMON-LABEL: zext_load_sink_call:
+; CHECK-COMMON: uxt
+; CHECK-DSP-IMM: uadd16
+; CHECK-COMMON: cmp
+; CHECK-DSP: uxt
+; CHECK-DSP-IMM-NOT: uxt
+define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
+entry:
+ %0 = load i16, i16* %ptr, align 4
+ %1 = add i16 %exp, 3
+ %cmp = icmp eq i16 %0, %exp
+ br i1 %cmp, label %exit, label %if.then
+
+if.then:
+ %conv0 = zext i16 %0 to i32
+ %conv1 = zext i16 %1 to i32
+ %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1)
+ br label %exit
+
+exit:
+ %exitval = phi i32 [ %call, %if.then ], [ 0, %entry ]
+ ret i32 %exitval
+}
+
+
+; Check that the pass doesn't try to promote the immediate parameters.
+; CHECK-COMMON-LABEL: call_with_imms
+; CHECK-COMMON-NOT: uxt
+define i8 @call_with_imms(i8* %arg) {
+ %call = tail call arm_aapcs_vfpcc zeroext i8 @dummy2(i8* nonnull %arg, i8 zeroext 0, i8 zeroext 0)
+ %cmp = icmp eq i8 %call, 0
+ %res = select i1 %cmp, i8 %call, i8 1
+ ret i8 %res
+}
+
+; Test that the call result is still extended.
+; CHECK-COMMON-LABEL: test_call:
+; CHECK-COMMON: bl
+; CHECK-COMMON-NEXT: sxtb r1, r0
+define i16 @test_call(i8 zeroext %arg) {
+ %call = call i8 @dummy_i8(i8 %arg)
+ %cmp = icmp ult i8 %call, 128
+ %conv = zext i1 %cmp to i16
+ ret i16 %conv
+}
+
+; Test that the transformation bails when it finds an i16 value while trying
+; to promote an i8 chain.
+; TODO: We should be able to remove the uxtb in these cases.
+; CHECK-COMMON-LABEL: promote_i8_sink_i16_1
+; CHECK-COMMON: bl dummy_i8
+; CHECK-COMMON: adds r0, #1
+; CHECK-COMMON: uxtb r0, r0
+; CHECK-COMMON: cmp r0
+define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) {
+ %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
+ %add = add nuw i8 %call, 1
+ %conv = zext i8 %add to i16
+ %cmp = icmp ne i16 %conv, %arg1
+ %sel = select i1 %cmp, i16 %arg1, i16 %arg2
+ %res = tail call zeroext i16 @dummy3(i16 %sel)
+ ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: promote_i8_sink_i16_2
+; CHECK-COMMON: bl dummy_i8
+; CHECK-COMMON: adds r0, #1
+; CHECK-COMMON: uxtb r0, r0
+; CHECK-COMMON: cmp r0
+define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroext %arg2) {
+ %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
+ %add = add nuw i8 %call, 1
+ %cmp = icmp ne i8 %add, %arg1
+ %conv = zext i8 %arg1 to i16
+ %sel = select i1 %cmp, i16 %conv, i16 %arg2
+ %res = tail call zeroext i16 @dummy3(i16 %sel)
+ ret i16 %res
+}
+
+@uc = global i8 42, align 1
+@LL = global i64 0, align 8
+
+; CHECK-COMMON-LABEL: zext_i64
+; CHECK-COMMON: ldrb
+; CHECK-COMMON: strd
+define void @zext_i64() {
+entry:
+ %0 = load i8, i8* @uc, align 1
+ %conv = zext i8 %0 to i64
+ store i64 %conv, i64* @LL, align 8
+ %cmp = icmp eq i8 %0, 42
+ %conv1 = zext i1 %cmp to i32
+ %call = tail call i32 bitcast (i32 (...)* @assert to i32 (i32)*)(i32 %conv1)
+ ret void
+}
+
+@a = global i16* null, align 4
+@b = global i32 0, align 4
+
+; CHECK-COMMON-LABEL: constexpr
+; CHECK-COMMON: uxth
+define i32 @constexpr() {
+entry:
+ store i32 ptrtoint (i32* @b to i32), i32* @b, align 4
+ %0 = load i16*, i16** @a, align 4
+ %1 = load i16, i16* %0, align 2
+ %or = or i16 %1, ptrtoint (i32* @b to i16)
+ store i16 %or, i16* %0, align 2
+ %cmp = icmp ne i16 %or, 4
+ %conv3 = zext i1 %cmp to i32
+ %call = tail call i32 bitcast (i32 (...)* @e to i32 (i32)*)(i32 %conv3) #2
+ ret i32 undef
+}
+
+; Check that d.sroa.0.0.be is promoted and passed directly into the tail call.
+; CHECK-COMMON-LABEL: check_zext_phi_call_arg
+; CHECK-COMMON-NOT: uxt
+define i32 @check_zext_phi_call_arg() {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond.backedge, %entry
+ %d.sroa.0.0 = phi i16 [ 30, %entry ], [ %d.sroa.0.0.be, %for.cond.backedge ]
+ %tobool = icmp eq i16 %d.sroa.0.0, 0
+ br i1 %tobool, label %for.cond.backedge, label %if.then
+
+for.cond.backedge: ; preds = %for.cond, %if.then
+ %d.sroa.0.0.be = phi i16 [ %call, %if.then ], [ 0, %for.cond ]
+ br label %for.cond
+
+if.then: ; preds = %for.cond
+ %d.sroa.0.0.insert.ext = zext i16 %d.sroa.0.0 to i32
+ %call = tail call zeroext i16 bitcast (i16 (...)* @f to i16 (i32)*)(i32 %d.sroa.0.0.insert.ext) #2
+ br label %for.cond.backedge
+}
+
+
+; The call to safe_lshift_func takes two parameters, but they're the same
+; value; one is just the zext of the other.
+; CHECK-COMMON-LABEL: call_zext_i8_i32
+define fastcc i32 @call_zext_i8_i32(i32 %p_45, i8 zeroext %p_46) {
+for.cond8.preheader:
+ %call217 = call fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 zeroext undef)
+ %tobool219 = icmp eq i8 %call217, 0
+ br i1 %tobool219, label %for.end411, label %for.cond273.preheader
+
+for.cond273.preheader: ; preds = %for.cond8.preheader
+ %call217.lcssa = phi i8 [ %call217, %for.cond8.preheader ]
+ %conv218.le = zext i8 %call217.lcssa to i32
+ %call346 = call fastcc zeroext i8 @safe_lshift_func(i8 zeroext %call217.lcssa, i32 %conv218.le)
+ unreachable
+
+for.end411: ; preds = %for.cond8.preheader
+ %call452 = call fastcc i64 @safe_sub_func_int64_t_s_s(i64 undef, i64 4)
+ unreachable
+}
+
+%struct.anon = type { i32 }
+
+@g_57 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4
+@g_893 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4
+@g_82 = hidden local_unnamed_addr global i32 0, align 4
+
+; Test that the transform bails on finding a call which returns an i16**
+; CHECK-COMMON-LABEL: call_return_pointer
+; CHECK-COMMON: sxth
+; CHECK-COMMON-NOT: uxt
+define hidden i32 @call_return_pointer(i8 zeroext %p_13) local_unnamed_addr #0 {
+entry:
+ %conv1 = zext i8 %p_13 to i16
+ %call = tail call i16** @func_62(i8 zeroext undef, i32 undef, i16 signext %conv1, i32* undef)
+ %0 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @g_893, i32 0, i32 0), align 4
+ %conv2 = trunc i32 %0 to i16
+ br label %for.cond
+
+for.cond: ; preds = %for.cond.backedge, %entry
+ %p_13.addr.0 = phi i8 [ %p_13, %entry ], [ %p_13.addr.0.be, %for.cond.backedge ]
+ %tobool = icmp eq i8 %p_13.addr.0, 0
+ br i1 %tobool, label %for.cond.backedge, label %if.then
+
+for.cond.backedge: ; preds = %for.cond, %if.then
+ %p_13.addr.0.be = phi i8 [ %conv4, %if.then ], [ 0, %for.cond ]
+ br label %for.cond
+
+if.then: ; preds = %for.cond
+ %call3 = tail call fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %conv2)
+ %conv4 = trunc i16 %call3 to i8
+ br label %for.cond.backedge
+}
+
+declare noalias i16** @func_62(i8 zeroext %p_63, i32 %p_64, i16 signext %p_65, i32* nocapture readnone %p_66)
+declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2)
+declare dso_local fastcc i64 @safe_sub_func_int64_t_s_s(i64, i64)
+declare dso_local fastcc zeroext i8 @safe_lshift_func(i8 zeroext, i32)
+declare dso_local fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 returned zeroext)
+
+declare dso_local i32 @e(...) local_unnamed_addr #1
+declare dso_local zeroext i16 @f(...) local_unnamed_addr #1
+
+declare i32 @dummy(i32, i32)
+declare i8 @dummy_i8(i8)
+declare i8 @dummy2(i8*, i8, i8)
+declare i16 @dummy3(i16)
+declare i32 @assert(...)
--- /dev/null
+; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv8 %s -o - | FileCheck %s
+
+; Check that ARMCodeGenPrepare doesn't optimise away sign extends.
+; CHECK-LABEL: test_signed_load:
+; CHECK: uxth
+define i16 @test_signed_load(i16* %ptr) {
+ %load = load i16, i16* %ptr
+ %conv0 = zext i16 %load to i32
+ %conv1 = sext i16 %load to i32
+ %cmp = icmp eq i32 %conv0, %conv1
+ %conv2 = zext i1 %cmp to i16
+ ret i16 %conv2
+}
+
+; Don't allow sign bit generating opcodes.
+; CHECK-LABEL: test_ashr:
+; CHECK: sxth
+define i16 @test_ashr(i16 zeroext %arg) {
+ %ashr = ashr i16 %arg, 1
+ %cmp = icmp eq i16 %ashr, 0
+ %conv = zext i1 %cmp to i16
+ ret i16 %conv
+}
+
+; CHECK-LABEL: test_sdiv:
+; CHECK: sxth
+define i16 @test_sdiv(i16 zeroext %arg) {
+ %sdiv = sdiv i16 %arg, 2
+ %cmp = icmp ne i16 %sdiv, 0
+ %conv = zext i1 %cmp to i16
+ ret i16 %conv
+}
+
+; CHECK-LABEL: test_srem
+; CHECK: sxth
+define i16 @test_srem(i16 zeroext %arg) {
+ %srem = srem i16 %arg, 4
+ %cmp = icmp ne i16 %srem, 0
+ %conv = zext i1 %cmp to i16
+ ret i16 %conv
+}
+