From 3e9f78170eb0c485d07e5006987835c1917e81f2 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 3 Oct 2019 11:33:50 +0000 Subject: [PATCH] [AArch64] Static (de)allocation of SVE stack objects. Adds support to AArch64FrameLowering to allocate fixed-stack SVE objects. The focus of this patch is purely to allow the stack frame to allocate/deallocate space for scalable SVE objects. More dynamic allocation (at compile-time, i.e. determining placement of SVE objects on the stack), or resolving frame-index references that include scalable-sized offsets, are left for subsequent patches. SVE objects are allocated in the stack frame as a separate region below the callee-save area, and above the alignment gap. This is done so that the SVE objects can be accessed directly from the FP at (runtime) VL-based offsets to benefit from using the VL-scaled addressing modes. The layout looks as follows: +-------------+ | stack arg | +-------------+ | Callee Saves| | X29, X30 | (if available) |-------------| <- FP (if available) | : | | SVE area | | : | +-------------+ |/////////////| alignment gap. | : | | Stack objs | | : | +-------------+ <- SP after call and frame-setup SVE and non-SVE stack objects are distinguished using different StackIDs. The offsets for objects with TargetStackID::SVEVector should be interpreted as purely scalable offsets within their respective SVE region. Reviewers: thegameg, rovka, t.p.northover, efriedma, rengolin, greened Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D61437 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373585 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MIRYamlMapping.h | 1 + include/llvm/CodeGen/TargetFrameLowering.h | 1 + lib/Target/AArch64/AArch64FrameLowering.cpp | 77 ++++++++++- lib/Target/AArch64/AArch64FrameLowering.h | 11 ++ lib/Target/AArch64/AArch64InstrInfo.cpp | 31 ++++- .../AArch64/AArch64MachineFunctionInfo.h | 16 +++ lib/Target/AArch64/AArch64StackOffset.h | 49 +++++-- lib/Target/AMDGPU/SIFrameLowering.cpp | 2 + test/CodeGen/AArch64/framelayout-sve.mir | 121 ++++++++++++++++++ unittests/Target/AArch64/TestStackOffset.cpp | 75 ++++++++++- 10 files changed, 369 insertions(+), 15 deletions(-) create mode 100644 test/CodeGen/AArch64/framelayout-sve.mir diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h index 94e76a75e8d..069d0aa4509 100644 --- a/include/llvm/CodeGen/MIRYamlMapping.h +++ b/include/llvm/CodeGen/MIRYamlMapping.h @@ -314,6 +314,7 @@ struct ScalarEnumerationTraits { static void enumeration(yaml::IO &IO, TargetStackID::Value &ID) { IO.enumCase(ID, "default", TargetStackID::Default); IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill); + IO.enumCase(ID, "sve-vec", TargetStackID::SVEVector); IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc); } }; diff --git a/include/llvm/CodeGen/TargetFrameLowering.h b/include/llvm/CodeGen/TargetFrameLowering.h index 284f7ba64db..6e4a723b426 100644 --- a/include/llvm/CodeGen/TargetFrameLowering.h +++ b/include/llvm/CodeGen/TargetFrameLowering.h @@ -28,6 +28,7 @@ namespace TargetStackID { enum Value { Default = 0, SGPRSpill = 1, + SVEVector = 2, NoAlloc = 255 }; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 8357b763179..c42c16bc1aa 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -55,6 +55,10 @@ // | callee-saved fp/simd/SVE regs | // | | // |-----------------------------------| +// | | +// | SVE stack objects | +// | | +// |-----------------------------------| // |.empty.space.to.make.part.below....| // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at // |.the.standard.16-byte.alignment....| compile time; if present) @@ -202,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +/// Returns the size of the entire SVE stackframe (calleesaves + spills). +static StackOffset getSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo(); + return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8}; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -214,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); unsigned NumBytes = AFI->getLocalStackSize(); - return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || + getSVEStackSize(MF)); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -456,6 +467,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (canUseRedZone(MF)) return false; + // When there is an SVE area on the stack, always allocate the + // callee-saves and spills/locals separately. + if (getSVEStackSize(MF)) + return false; + return true; } @@ -870,6 +886,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Ideally it should match SP value after prologue. AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -880,6 +898,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!SVEStackSize && + "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); if (!NumBytes) @@ -926,6 +946,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); @@ -1083,6 +1104,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1431,8 +1455,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); @@ -1446,6 +1473,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Deallocate the SVE area. + if (SVEStackSize) + if (!AFI->isStackRealigned()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, + TII, MachineInstr::FrameDestroy); + if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the @@ -1595,6 +1628,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + if (SVEStackSize) + llvm_unreachable("Accessing frame indices in presence of SVE " + "not yet supported"); + // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the @@ -2175,8 +2213,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); + bool HasSVEStackObjects = [&MFI]() { + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) + if (MFI.getStackID(I) == TargetStackID::SVEVector && + MFI.getObjectOffset(I) < 0) + return true; + // Note: We don't take allocatable stack objects into + // account yet, because allocation for those is not yet + // implemented. + return false; + }(); + // If any callee-saved registers are used, the frame cannot be eliminated. - bool CanEliminateFrame = SavedRegs.count() == 0; + bool CanEliminateFrame = (SavedRegs.count() == 0) && !HasSVEStackObjects; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. @@ -2239,12 +2288,34 @@ bool AArch64FrameLowering::enableStackSlotScavenging( void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && + "Upwards growing stack unsupported"); + + // Process all fixed stack SVE objects. + int64_t Offset = 0; + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) { + unsigned StackID = MFI.getStackID(I); + if (StackID == TargetStackID::SVEVector) { + int64_t FixedOffset = -MFI.getObjectOffset(I); + if (FixedOffset > Offset) + Offset = FixedOffset; + } + } + + unsigned MaxAlign = getStackAlignment(); + uint64_t SVEStackSize = alignTo(Offset, MaxAlign); + + AArch64FunctionInfo *AFI = MF.getInfo(); + AFI->setStackSizeSVE(SVEStackSize); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 7ed20d24607..99d868a95a7 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -87,6 +87,17 @@ public: int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool isSupportedStackID(TargetStackID::Value ID) const override { + switch (ID) { + default: + return false; + case TargetStackID::Default: + case TargetStackID::SVEVector: + case TargetStackID::NoAlloc: + return true; + } + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 097a8ba0ae1..1cc3177b26a 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3046,6 +3046,16 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MaxEncoding = 0xfff; ShiftSize = 12; break; + case AArch64::ADDVL_XXI: + case AArch64::ADDPL_XXI: + MaxEncoding = 31; + ShiftSize = 0; + if (Offset < 0) { + MaxEncoding = 32; + Sign = -1; + Offset = -Offset; + } + break; default: llvm_unreachable("Unsupported opcode"); } @@ -3117,8 +3127,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, bool NeedsWinCFI, bool *HasWinCFI) { - int64_t Bytes; - Offset.getForFrameOffset(Bytes); + int64_t Bytes, NumPredicateVectors, NumDataVectors; + Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); // First emit non-scalable frame offsets, or a simple 'mov'. if (Bytes || (!Offset && SrcReg != DestReg)) { @@ -3133,6 +3143,23 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, NeedsWinCFI, HasWinCFI); SrcReg = DestReg; } + + assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && + "SetNZCV not supported with SVE vectors"); + assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && + "WinCFI not supported with SVE vectors"); + + if (NumDataVectors) { + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + SrcReg = DestReg; + } + + if (NumPredicateVectors) { + assert(DestReg != AArch64::SP && "Unaligned access to SP"); + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); + } } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0efeeb272ec..a7d0a742573 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -95,6 +95,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. unsigned SRetReturnReg = 0; + /// SVE stack size (for predicates and data vectors) are maintained here + /// rather than in FrameInfo, as the placement and Stack IDs are target + /// specific. + uint64_t StackSizeSVE = 0; + + /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise. @@ -131,6 +138,15 @@ public: ArgumentStackToRestore = bytes; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + void setStackSizeSVE(uint64_t S) { + HasCalculatedStackSizeSVE = true; + StackSizeSVE = S; + } + + uint64_t getStackSizeSVE() const { return StackSizeSVE; } + bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h index 5f5cdfa2fad..13f12a6c9c3 100644 --- a/lib/Target/AArch64/AArch64StackOffset.h +++ b/lib/Target/AArch64/AArch64StackOffset.h @@ -35,32 +35,38 @@ namespace llvm { /// vector and a 64bit GPR. class StackOffset { int64_t Bytes; + int64_t ScalableBytes; explicit operator int() const; public: using Part = std::pair; - StackOffset() : Bytes(0) {} + StackOffset() : Bytes(0), ScalableBytes(0) {} StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() { - assert(!MVT(T).isScalableVector() && "Scalable types not supported"); + assert(MVT(T).getSizeInBits() % 8 == 0 && + "Offset type is not a multiple of bytes"); *this += Part(Offset, T); } - StackOffset(const StackOffset &Other) : Bytes(Other.Bytes) {} + StackOffset(const StackOffset &Other) + : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {} StackOffset &operator=(const StackOffset &) = default; StackOffset &operator+=(const StackOffset::Part &Other) { - assert(Other.second.getSizeInBits() % 8 == 0 && - "Offset type is not a multiple of bytes"); - Bytes += Other.first * (Other.second.getSizeInBits() / 8); + int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8); + if (Other.second.isScalableVector()) + ScalableBytes += OffsetInBytes; + else + Bytes += OffsetInBytes; return *this; } StackOffset &operator+=(const StackOffset &Other) { Bytes += Other.Bytes; + ScalableBytes += Other.ScalableBytes; return *this; } @@ -72,6 +78,7 @@ public: StackOffset &operator-=(const StackOffset &Other) { Bytes -= Other.Bytes; + ScalableBytes -= Other.ScalableBytes; return *this; } @@ -88,16 +95,42 @@ public: return Res; } + /// Returns the scalable part of the offset in bytes. + int64_t getScalableBytes() const { return ScalableBytes; } + /// Returns the non-scalable part of the offset in bytes. int64_t getBytes() const { return Bytes; } /// Returns the offset in parts to which this frame offset can be /// decomposed for the purpose of describing a frame offset. /// For non-scalable offsets this is simply its byte size. - void getForFrameOffset(int64_t &ByteSized) const { ByteSized = Bytes; } + void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) const { + assert(isValid() && "Invalid frame offset"); + + NumBytes = Bytes; + NumDataVectors = 0; + NumPredicateVectors = ScalableBytes / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. + if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } + } /// Returns whether the offset is known zero. - explicit operator bool() const { return Bytes; } + explicit operator bool() const { return Bytes || ScalableBytes; } + + bool isValid() const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + return ScalableBytes % 2 == 0; + } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 22f035e7f3e..ed07ed100a1 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -673,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: return true; + case TargetStackID::SVEVector: + return false; } llvm_unreachable("Invalid TargetStackID::Value"); } diff --git a/test/CodeGen/AArch64/framelayout-sve.mir b/test/CodeGen/AArch64/framelayout-sve.mir new file mode 100644 index 00000000000..9009a6a29bf --- /dev/null +++ b/test/CodeGen/AArch64/framelayout-sve.mir @@ -0,0 +1,121 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# +# Test allocation and deallocation of SVE objects on the stack, +# as well as using a combination of scalable and non-scalable +# offsets to access the SVE on the stack. +# +# SVE objects are allocated below the (scalar) callee saves, +# and above spills/locals and the alignment gap, e.g. +# +# +-------------+ +# | stack arg | +# +-------------+ <- SP before call +# | Callee Saves| +# | Frame record| (if available) +# |-------------| <- FP (if available) +# | SVE area | +# +-------------+ +# |/////////////| alignment gap. +# | : | +# | Stack objs | +# | : | +# +-------------+ <- SP after call and frame-setup +# +--- | + + define void @test_allocate_sve() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_callee_saves() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_realigned() nounwind { entry: unreachable } + +... +# +----------+ +# | %fixed- | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | stack.0 | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) +# +----------+ +# | %stack.0 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_sve +# CHECK: stackSize: 16 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 + +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + RET_ReallyLR +--- +... +# +----------+ +# | x20, x21 | // callee saves +# +----------+ +# | %fixed- | // scalable objects +# | stack.0 | +# +----------+ +# | %stack.0 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_sve_gpr_callee_saves +# CHECK: stackSize: 32 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup STPXpre killed $x21, killed $x20, $sp, -2 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $x20 = IMPLICIT_DEF +# CHECK-NEXT: $x21 = IMPLICIT_DEF +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: $sp, $x21, $x20 = frame-destroy LDPXpost $sp, 2 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve_gpr_callee_saves +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + $x20 = IMPLICIT_DEF + $x21 = IMPLICIT_DEF + RET_ReallyLR +--- +... +# +----------+ +# | lr, fp | // frame record +# +----------+ <- FP +# | %fixed- | // scalable objects +# | stack.0 | +# +----------+ +# |//////////| // alignment gap +# | %stack.0 | // not scalable +# +----------+ <- SP +# CHECK-LABEL: name: test_allocate_sve_gpr_realigned +# CHECK: stackSize: 32 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve_gpr_realigned +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 32 } +body: | + bb.0.entry: + RET_ReallyLR +--- diff --git a/unittests/Target/AArch64/TestStackOffset.cpp b/unittests/Target/AArch64/TestStackOffset.cpp index 240cec9f2d0..c85135ef660 100644 --- a/unittests/Target/AArch64/TestStackOffset.cpp +++ b/unittests/Target/AArch64/TestStackOffset.cpp @@ -20,6 +20,15 @@ TEST(StackOffset, MixedSize) { StackOffset C(2, MVT::v4i64); EXPECT_EQ(64, C.getBytes()); + + StackOffset D(2, MVT::nxv4i64); + EXPECT_EQ(64, D.getScalableBytes()); + + StackOffset E(2, MVT::v4i64); + EXPECT_EQ(0, E.getScalableBytes()); + + StackOffset F(2, MVT::nxv4i64); + EXPECT_EQ(0, F.getBytes()); } TEST(StackOffset, Add) { @@ -31,6 +40,11 @@ TEST(StackOffset, Add) { StackOffset D(1, MVT::i32); D += A; EXPECT_EQ(12, D.getBytes()); + + StackOffset E(1, MVT::nxv1i32); + StackOffset F = C + E; + EXPECT_EQ(12, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, Sub) { @@ -42,6 +56,12 @@ TEST(StackOffset, Sub) { StackOffset D(1, MVT::i64); D -= A; EXPECT_EQ(0, D.getBytes()); + + C += StackOffset(2, MVT::nxv1i32); + StackOffset E = StackOffset(1, MVT::nxv1i32); + StackOffset F = C - E; + EXPECT_EQ(4, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, isZero) { @@ -49,12 +69,63 @@ TEST(StackOffset, isZero) { StackOffset B(0, MVT::i32); EXPECT_TRUE(!A); EXPECT_TRUE(!(A + B)); + + StackOffset C(0, MVT::nxv1i32); + EXPECT_TRUE(!(A + C)); + + StackOffset D(1, MVT::nxv1i32); + EXPECT_FALSE(!(A + D)); +} + +TEST(StackOffset, isValid) { + EXPECT_FALSE(StackOffset(1, MVT::nxv8i1).isValid()); + EXPECT_TRUE(StackOffset(2, MVT::nxv8i1).isValid()); + +#ifndef NDEBUG +#ifdef GTEST_HAS_DEATH_TEST + EXPECT_DEATH(StackOffset(1, MVT::i1), + "Offset type is not a multiple of bytes"); + EXPECT_DEATH(StackOffset(1, MVT::nxv1i1), + "Offset type is not a multiple of bytes"); +#endif // defined GTEST_HAS_DEATH_TEST +#endif // not defined NDEBUG } TEST(StackOffset, getForFrameOffset) { StackOffset A(1, MVT::i64); StackOffset B(1, MVT::i32); - int64_t ByteSized; - (A + B).getForFrameOffset(ByteSized); + StackOffset C(1, MVT::nxv4i32); + + // If all offsets can be materialized with only ADDVL, + // make sure PLSized is 0. + int64_t ByteSized, VLSized, PLSized; + (A + B + C).getForFrameOffset(ByteSized, PLSized, VLSized); EXPECT_EQ(12, ByteSized); + EXPECT_EQ(1, VLSized); + EXPECT_EQ(0, PLSized); + + // If we need an ADDPL to materialize the offset, and the number of scalable + // bytes fits the ADDPL immediate, fold the scalable bytes to fit in PLSized. + StackOffset D(1, MVT::nxv16i1); + (C + D).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(9, PLSized); + + StackOffset E(4, MVT::nxv4i32); + StackOffset F(1, MVT::nxv16i1); + (E + F).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(33, PLSized); + + // If the offset requires an ADDPL instruction to materialize, and would + // require more than two instructions, decompose it into both + // ADDVL (n x 16 bytes) and ADDPL (n x 2 bytes) instructions. + StackOffset G(8, MVT::nxv4i32); + StackOffset H(1, MVT::nxv16i1); + (G + H).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(8, VLSized); + EXPECT_EQ(1, PLSized); } -- 2.40.0