From 9889150b54b474b85aa6b8c879c9729680f21cc6 Mon Sep 17 00:00:00 2001
From: Pengfei Wang
Date: Fri, 26 Jul 2019 07:33:15 +0000
Subject: [PATCH] [WinEH] Allocate space in funclets' stack to save XMM CSRs

Summary:
This is an alternate approach to D57970.

Currently funclets reuse the same stack slots that are used in the
parent function for saving callee-saved XMM registers. If the parent
function modifies a callee-saved XMM register before an exception is
thrown, the catch handler will overwrite the original saved value.

This patch allocates space in the funclet's stack frame for saving
callee-saved XMM registers and uses RSP instead of RBP to access that
memory.
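
For illustration, a minimal reproducer looks like the following (a
sketch only: the function name is arbitrary, the inline-asm clobber
mirrors the one in the new win64-funclet-savexmm.ll test, and any
callee-saved XMM register shows the same problem):

    // Sketch, compiled for x86_64-pc-windows-msvc with C++ EH enabled.
    void parent() {
      // Clobber xmm6 after the parent prologue has already spilled it.
      __asm("nop" ::: "xmm6");
      try {
        throw 1;
      } catch (int) {
        // The catch funclet saves/restores xmm6 too. Before this patch
        // its prologue stored xmm6 into the same parent-frame slot,
        // overwriting the value the parent prologue had saved, so the
        // parent epilogue later restored the clobbered value to the
        // caller.
      }
    }
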
Reviewers: andrew.w.kaylor, LuoYuanke, annita.zhang, craig.topper, RKSimon

Subscribers: rnk, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D63396

Signed-off-by: pengfei

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@367088 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp          | 133 +++++++++++++++----
 lib/Target/X86/X86FrameLowering.h            |   4 +
 lib/Target/X86/X86MachineFunctionInfo.h      |  13 ++
 test/CodeGen/X86/avx512-intel-ocl.ll         |  56 ++++----
 test/CodeGen/X86/catchpad-realign-savexmm.ll |  15 +++
 test/CodeGen/X86/win64-funclet-savexmm.ll    |  68 ++++++++++
 test/CodeGen/X86/x86-interrupt_cc.ll         |  12 +-
 7 files changed, 244 insertions(+), 57 deletions(-)
 create mode 100644 test/CodeGen/X86/win64-funclet-savexmm.ll

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 1fb6eb33872..aff686e9cae 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -935,7 +935,10 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
        ; calls @llvm.eh.unwind.init
   [if needs FP]
   [for all callee-saved XMM registers]
-      movaps %<xmm reg>, -MMM(%rbp)
+      [if funclet]
+          movaps %<xmm reg>, -MMM(%rsp)
+      [else]
+          movaps %<xmm reg>, -MMM(%rbp)
   [for all callee-saved XMM registers]
       .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
           ; i.e. the offset relative to (%rbp - SEHFrameOffset)
@@ -955,7 +958,10 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
   ; Emit CFI info
   [if needs FP]
   [for all callee-saved registers]
-      .cfi_offset %<reg>, (offset from %rbp)
+      [if funclet]
+          movaps -MMM(%rsp), %<xmm reg>
+      [else]
+          .cfi_offset %<reg>, (offset from %rbp)
   [else]
       .cfi_def_cfa_offset (offset from RETADDR)
   [for all callee-saved registers]
@@ -1177,11 +1183,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     MFI.setOffsetAdjustment(-StackSize);
   }
 
-  // For EH funclets, only allocate enough space for outgoing calls. Save the
-  // NumBytes value that we would've used for the parent frame.
+  // For EH funclets, only allocate enough space for outgoing calls and
+  // callee-saved XMM registers on 64-bit Windows. Save the NumBytes value
+  // that we would've used for the parent frame.
+  int XMMFrameSlotOrigin;
   unsigned ParentFrameNumBytes = NumBytes;
-  if (IsFunclet)
+  if (IsFunclet) {
     NumBytes = getWinEHFuncletFrameSize(MF);
+    if (IsWin64Prologue)
+      NumBytes += X86FI->getCalleeSavedXMMFrameInfo(XMMFrameSlotOrigin);
+  }
 
   // Skip the callee-saved push instructions.
   bool PushedRegs = false;
@@ -1389,19 +1400,33 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
-    const MachineInstr &FrameInstr = *MBBI;
+    auto FrameInstr = MBBI;
     ++MBBI;
 
     if (NeedsWinCFI) {
       int FI;
-      if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
+      if (unsigned Reg = TII.isStoreToStackSlot(*FrameInstr, FI)) {
         if (X86::FR64RegClass.contains(Reg)) {
-          unsigned IgnoredFrameReg;
-          int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
-          Offset += SEHFrameOffset;
-
+          int Offset = 0;
           HasWinCFI = true;
-          assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
+          if (IsFunclet) {
+            assert(IsWin64Prologue && "Only valid on 64-bit Windows");
+            unsigned Size = TRI->getSpillSize(X86::VR128RegClass);
+            unsigned Align = TRI->getSpillAlignment(X86::VR128RegClass);
+            Offset = (FI - XMMFrameSlotOrigin - 1) * Size +
+                     alignDown(NumBytes, Align);
+            addRegOffset(BuildMI(MBB, MBBI, DL,
+                                 TII.get(getXMMAlignedLoadStoreOp(false))),
+                         StackPtr, true, Offset)
+                .addReg(Reg)
+                .setMIFlag(MachineInstr::FrameSetup);
+            MBB.erase(FrameInstr);
+          } else {
+            assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
+            unsigned IgnoredFrameReg;
+            Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
+                     SEHFrameOffset;
+          }
           BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
               .addImm(Reg)
               .addImm(Offset)
@@ -1621,6 +1646,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (IsFunclet) {
     assert(HasFP && "EH funclets without FP not yet implemented");
     NumBytes = getWinEHFuncletFrameSize(MF);
+    int Ignore;
+    if (IsWin64Prologue)
+      NumBytes += X86FI->getCalleeSavedXMMFrameInfo(Ignore);
   } else if (HasFP) {
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
@@ -1948,6 +1976,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
   unsigned CalleeSavedFrameSize = 0;
+  unsigned CalleeSavedXMMFrameSize = 0;
+  int CalleeSavedXMMSlotOrigin = 0;
   int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
 
   int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -2011,9 +2041,44 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
   MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);
 
   // Assign slots for XMMs.
+  for (unsigned i = CSI.size(), Size = 0; i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+    // According to the Microsoft "x64 software conventions", the XMM
+    // registers are the only nonvolatile registers besides the GPRs.
+    if (!X86::VR128RegClass.contains(Reg))
+      continue;
+    // Since all XMM registers have the same spill size, initialize once.
+    if (Size == 0) {
+      unsigned Align = TRI->getSpillAlignment(X86::VR128RegClass);
+      // ensure alignment
+      int Remainder = SpillSlotOffset % Align;
+      if (Remainder < 0)
+        SpillSlotOffset -= Align + Remainder;
+      else
+        SpillSlotOffset -= Remainder;
+      MFI.ensureMaxAlignment(Align);
+      Size = TRI->getSpillSize(X86::VR128RegClass);
+    }
+    // spill into slot
+    SpillSlotOffset -= Size;
+    int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
+    CSI[i - 1].setFrameIdx(SlotIndex);
+    // Since the XMM slots are allocated consecutively on the stack, we only
+    // need to record the first one for use by the funclets.
+    if (CalleeSavedXMMFrameSize == 0) {
+      CalleeSavedXMMSlotOrigin = SlotIndex;
+    }
+    CalleeSavedXMMFrameSize += Size;
+  }
+
+  X86FI->setCalleeSavedXMMFrameInfo(CalleeSavedXMMFrameSize,
+                                    CalleeSavedXMMSlotOrigin);
+
+  // Assign slots for others.
   for (unsigned i = CSI.size(); i != 0; --i) {
     unsigned Reg = CSI[i - 1].getReg();
-    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg) ||
+        X86::VR128RegClass.contains(Reg))
       continue;
 
     // If this is k-register make sure we lookup via the largest legal type.
@@ -2025,7 +2090,11 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     unsigned Size = TRI->getSpillSize(*RC);
     unsigned Align = TRI->getSpillAlignment(*RC);
     // ensure alignment
-    SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
+    int Remainder = SpillSlotOffset % Align;
+    if (Remainder < 0)
+      SpillSlotOffset -= Align + Remainder;
+    else
+      SpillSlotOffset -= Remainder;
     // spill into slot
     SpillSlotOffset -= Size;
     int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
@@ -2164,19 +2233,32 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   // Reload XMMs from stack frame.
+  MachineFunction &MF = *MBB.getParent();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  int XMMFrameSlotOrigin;
+  int SEHFrameOffset = X86FI->getCalleeSavedXMMFrameInfo(XMMFrameSlotOrigin) +
+                       MF.getFrameInfo().getMaxCallFrameSize();
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
-    if (X86::GR64RegClass.contains(Reg) ||
-        X86::GR32RegClass.contains(Reg))
-      continue;
+    if (MBB.isEHFuncletEntry() && STI.is64Bit()) {
+      if (X86::VR128RegClass.contains(Reg)) {
+        int Offset = (CSI[i].getFrameIdx() - XMMFrameSlotOrigin - 1) * 16;
+        addRegOffset(BuildMI(MBB, MI, DL,
+                             TII.get(getXMMAlignedLoadStoreOp(true)), Reg),
+                     X86::RSP, true, SEHFrameOffset + Offset);
+      }
+    } else {
+      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+        continue;
 
-    // If this is k-register make sure we lookup via the largest legal type.
-    MVT VT = MVT::Other;
-    if (X86::VK16RegClass.contains(Reg))
-      VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+      // If this is k-register make sure we lookup via the largest legal type.
+      MVT VT = MVT::Other;
+      if (X86::VK16RegClass.contains(Reg))
+        VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
 
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
-    TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+      TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+    }
   }
 
   // POP GPRs.
@@ -3185,3 +3267,8 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
                     UnwindHelpFI)
       .addImm(-2);
 }
+
+unsigned X86FrameLowering::getXMMAlignedLoadStoreOp(const bool IsLoad) const {
+  return IsLoad ? (STI.hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm)
+                : (STI.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr);
+}
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index d32746e3a36..5e534461177 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -217,6 +217,10 @@ private:
   void emitCatchRetReturnValue(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
                                MachineInstr *CatchRet) const;
+
+  /// Select the best opcode for the subtarget when saving or restoring
+  /// funclet XMM CSRs.
+  unsigned getXMMAlignedLoadStoreOp(const bool IsLoad) const;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index d7e535598d8..a160f618b23 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -40,6 +40,14 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// stack frame in bytes.
   unsigned CalleeSavedFrameSize = 0;
 
+  /// CalleeSavedXMMFrameSize - Size of the callee-saved XMM register portion
+  /// of the stack frame in bytes.
+  unsigned CalleeSavedXMMFrameSize = 0;
+
+  /// CalleeSavedXMMFrameOrigin - Origin slot of the callee-saved XMM register
+  /// portion of the stack frame.
+  int CalleeSavedXMMFrameOrigin = 0;
+
   /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
   /// to the space used by the return address).
   /// Used on windows platform for stdcall & fastcall name decoration
@@ -123,6 +131,11 @@ public:
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
 
+  unsigned getCalleeSavedXMMFrameInfo(int &origin) const
+  { origin = CalleeSavedXMMFrameOrigin; return CalleeSavedXMMFrameSize; }
+  void setCalleeSavedXMMFrameInfo(unsigned size, int origin)
+  { CalleeSavedXMMFrameSize = size; CalleeSavedXMMFrameOrigin = origin; }
+
   unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
   void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
 
diff --git a/test/CodeGen/X86/avx512-intel-ocl.ll b/test/CodeGen/X86/avx512-intel-ocl.ll
index defedd2a7f6..4b1b681b055 100644
--- a/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -184,14 +184,14 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; WIN64-KNL-LABEL: test_prolog_epilog:
 ; WIN64-KNL:       # %bb.0:
 ; WIN64-KNL-NEXT:    pushq %rbp
-; WIN64-KNL-NEXT:    subq $1328, %rsp # imm = 0x530
+; WIN64-KNL-NEXT:    subq $1264, %rsp # imm = 0x4F0
 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
-; WIN64-KNL-NEXT:    kmovw %k7, 1198(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    kmovw %k6, 1196(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    kmovw %k5, 1194(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    kmovw %k4, 1192(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm21, 1104(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm20, 992(%rbp) # 64-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k7, 1134(%rbp) # 2-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k6, 1132(%rbp) # 2-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k5, 1130(%rbp) # 2-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k4, 1128(%rbp) # 2-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm21, 1024(%rbp) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm20, 960(%rbp) # 64-byte Spill
 ; WIN64-KNL-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
 ; WIN64-KNL-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
 ; WIN64-KNL-NEXT:    vmovaps %zmm17, 768(%rbp) # 64-byte Spill
@@ -226,26 +226,26 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; WIN64-KNL-NEXT:    vmovaps 768(%rbp), %zmm17 # 64-byte Reload
 ; WIN64-KNL-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
 ; WIN64-KNL-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 992(%rbp), %zmm20 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 1104(%rbp), %zmm21 # 64-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1192(%rbp), %k4 # 2-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1194(%rbp), %k5 # 2-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1196(%rbp), %k6 # 2-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1198(%rbp), %k7 # 2-byte Reload
-; WIN64-KNL-NEXT:    leaq 1200(%rbp), %rsp
+; WIN64-KNL-NEXT:    vmovaps 960(%rbp), %zmm20 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps 1024(%rbp), %zmm21 # 64-byte Reload
+; WIN64-KNL-NEXT:    kmovw 1128(%rbp), %k4 # 2-byte Reload
+; WIN64-KNL-NEXT:    kmovw 1130(%rbp), %k5 # 2-byte Reload
+; WIN64-KNL-NEXT:    kmovw 1132(%rbp), %k6 # 2-byte Reload
+; WIN64-KNL-NEXT:    kmovw 1134(%rbp), %k7 # 2-byte Reload
+; WIN64-KNL-NEXT:    leaq 1136(%rbp), %rsp
 ; WIN64-KNL-NEXT:    popq %rbp
 ; WIN64-KNL-NEXT:    retq
 ;
 ; WIN64-SKX-LABEL: test_prolog_epilog:
 ; WIN64-SKX:       # %bb.0:
 ; WIN64-SKX-NEXT:    pushq %rbp
-; WIN64-SKX-NEXT:    subq $1328, %rsp # imm = 0x530
+; WIN64-SKX-NEXT:    subq $1264, %rsp # imm = 0x4F0
 ; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
-; WIN64-SKX-NEXT:    kmovq %k7, 1192(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    kmovq %k6, 1184(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    kmovq %k5, 1176(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    kmovq %k4, 1168(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm21, 1056(%rbp) # 64-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k7, 1128(%rbp) # 8-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k6, 1120(%rbp) # 8-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k5, 1112(%rbp) # 8-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k4, 1104(%rbp) # 8-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm21, 1024(%rbp) # 64-byte Spill
 ; WIN64-SKX-NEXT:    vmovaps %zmm20, 960(%rbp) # 64-byte Spill
 ; WIN64-SKX-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
 ; WIN64-SKX-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
@@ -282,12 +282,12 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; WIN64-SKX-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
 ; WIN64-SKX-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
 ; WIN64-SKX-NEXT:    vmovaps 960(%rbp), %zmm20 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 1056(%rbp), %zmm21 # 64-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1168(%rbp), %k4 # 8-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1176(%rbp), %k5 # 8-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1184(%rbp), %k6 # 8-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1192(%rbp), %k7 # 8-byte Reload
-; WIN64-SKX-NEXT:    leaq 1200(%rbp), %rsp
+; WIN64-SKX-NEXT:    vmovaps 1024(%rbp), %zmm21 # 64-byte Reload
+; WIN64-SKX-NEXT:    kmovq 1104(%rbp), %k4 # 8-byte Reload
+; WIN64-SKX-NEXT:    kmovq 1112(%rbp), %k5 # 8-byte Reload
+; WIN64-SKX-NEXT:    kmovq 1120(%rbp), %k6 # 8-byte Reload
+; WIN64-SKX-NEXT:    kmovq 1128(%rbp), %k7 # 8-byte Reload
+; WIN64-SKX-NEXT:    leaq 1136(%rbp), %rsp
 ; WIN64-SKX-NEXT:    popq %rbp
 ; WIN64-SKX-NEXT:    retq
 ;
@@ -346,7 +346,7 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; X64-SKX:       ## %bb.0:
 ; X64-SKX-NEXT:    pushq %rsi
 ; X64-SKX-NEXT:    pushq %rdi
-; X64-SKX-NEXT:    subq $1192, %rsp ## imm = 0x4A8
+; X64-SKX-NEXT:    subq $1064, %rsp ## imm = 0x428
 ; X64-SKX-NEXT:    kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
 ; X64-SKX-NEXT:    kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
 ; X64-SKX-NEXT:    kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
@@ -388,7 +388,7 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
 ; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
 ; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
-; X64-SKX-NEXT:    addq $1192, %rsp ## imm = 0x4A8
+; X64-SKX-NEXT:    addq $1064, %rsp ## imm = 0x428
 ; X64-SKX-NEXT:    popq %rdi
 ; X64-SKX-NEXT:    popq %rsi
 ; X64-SKX-NEXT:    retq
diff --git a/test/CodeGen/X86/catchpad-realign-savexmm.ll b/test/CodeGen/X86/catchpad-realign-savexmm.ll
index 1160101792f..e8bccdabdcd 100644
--- a/test/CodeGen/X86/catchpad-realign-savexmm.ll
+++ b/test/CodeGen/X86/catchpad-realign-savexmm.ll
@@ -51,3 +51,18 @@ catch:
 ; CHECK: popq %rbp
 ; CHECK: retq
 ; CHECK: .seh_handlerdata
+; CHECK: # %catch
+; CHECK: movq %rdx, 16(%rsp)
+; CHECK: pushq %rbp
+; CHECK: .seh_pushreg 5
+; CHECK: subq $48, %rsp
+; CHECK: .seh_stackalloc 48
+; CHECK: leaq 64(%rdx), %rbp
+; CHECK: movapd %xmm6, 32(%rsp)
+; CHECK: .seh_savexmm 6, 32
+; CHECK: .seh_endprologue
+; CHECK: movapd 32(%rsp), %xmm6
+; CHECK: leaq .LBB0_1(%rip), %rax
+; CHECK: addq $48, %rsp
+; CHECK: popq %rbp
+; CHECK: retq # CATCHRET
diff --git a/test/CodeGen/X86/win64-funclet-savexmm.ll b/test/CodeGen/X86/win64-funclet-savexmm.ll
new file mode 100644
index 00000000000..f41c52e00ef
--- /dev/null
+++ b/test/CodeGen/X86/win64-funclet-savexmm.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -mattr=+avx < %s | FileCheck %s
+
+; void foo(void)
+; {
+;   __asm("nop" ::: "bx", "cx", "xmm5", "xmm6", "ymm7");
+;   try {
+;     throw;
+;   }
+;   catch (int x) {
+;   }
+; }
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.ThrowInfo = type { i32, i8*, i8*, i8* }
+
+$"??_R0H@8" = comdat any
+
+@"??_7type_info@@6B@" = external constant i8*
+@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+declare dso_local x86_stdcallcc void @_CxxThrowException(i8*, %eh.ThrowInfo*)
+
+define dso_local void @"?foo@@YAXXZ"() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %x = alloca i32, align 4
+  call void asm sideeffect "nop", "~{bx},~{cx},~{xmm5},~{xmm6},~{ymm7}"()
+  invoke void @_CxxThrowException(i8* null, %eh.ThrowInfo* null)
+          to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %1 = catchpad within %0 [%rtti.TypeDescriptor2* @"??_R0H@8", i32 0, i32* %x]
+  catchret from %1 to label %catchret.dest
+
+catchret.dest:                                    ; preds = %catch
+  br label %try.cont
+
+try.cont:                                         ; preds = %catchret.dest
+  ret void
+
+unreachable:                                      ; preds = %entry
+  unreachable
+}
+
+; CHECK: # %catch
+; CHECK: movq %rdx, 16(%rsp)
+; CHECK: pushq %rbp
+; CHECK: .seh_pushreg 5
+; CHECK: pushq %rbx
+; CHECK: .seh_pushreg 3
+; CHECK: subq $72, %rsp
+; CHECK: .seh_stackalloc 72
+; CHECK: leaq 80(%rdx), %rbp
+; CHECK: vmovaps %xmm7, 48(%rsp)
+; CHECK: .seh_savexmm 7, 48
+; CHECK: vmovaps %xmm6, 32(%rsp)
+; CHECK: .seh_savexmm 6, 32
+; CHECK: .seh_endprologue
+; CHECK: vmovaps 32(%rsp), %xmm6
+; CHECK: vmovaps 48(%rsp), %xmm7
+; CHECK: leaq .LBB0_3(%rip), %rax
+; CHECK: addq $72, %rsp
+; CHECK: popq %rbx
+; CHECK: popq %rbp
+; CHECK: retq # CATCHRET
diff --git a/test/CodeGen/X86/x86-interrupt_cc.ll b/test/CodeGen/X86/x86-interrupt_cc.ll
index 09f82b46c21..2043816f3a0 100644
--- a/test/CodeGen/X86/x86-interrupt_cc.ll
+++ b/test/CodeGen/X86/x86-interrupt_cc.ll
@@ -294,7 +294,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK64-SKX-NEXT:    kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
 ; CHECK64-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x08,0x00,0x00]
 ; CHECK64-SKX-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x07,0x00,0x00]
+; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x7c,0x24,0x1f]
 ; CHECK64-SKX-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
 ; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x74,0x24,0x1e]
 ; CHECK64-SKX-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
@@ -398,7 +398,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK64-SKX-NEXT:    .cfi_offset %xmm28, -448
 ; CHECK64-SKX-NEXT:    .cfi_offset %xmm29, -384
 ; CHECK64-SKX-NEXT:    .cfi_offset %xmm30, -320
-; CHECK64-SKX-NEXT:    .cfi_offset %xmm31, -224
+; CHECK64-SKX-NEXT:    .cfi_offset %xmm31, -256
 ; CHECK64-SKX-NEXT:    .cfi_offset %k0, -144
 ; CHECK64-SKX-NEXT:    .cfi_offset %k1, -136
 ; CHECK64-SKX-NEXT:    .cfi_offset %k2, -128
@@ -474,7 +474,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
 ; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x74,0x24,0x1e]
 ; CHECK64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
-; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x10,0xbc,0x24,0xe0,0x07,0x00,0x00]
+; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x7c,0x24,0x1f]
 ; CHECK64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
 ; CHECK64-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x08,0x00,0x00]
 ; CHECK64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
@@ -635,7 +635,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK32-SKX-NEXT:    kmovq %k0, {{[0-9]+}}(%esp) ## 8-byte Spill
 ; CHECK32-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x02,0x00,0x00]
 ; CHECK32-SKX-NEXT:    vmovups %zmm7, {{[0-9]+}}(%esp) ## 64-byte Spill
-; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x01,0x00,0x00]
+; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07]
 ; CHECK32-SKX-NEXT:    vmovups %zmm6, {{[0-9]+}}(%esp) ## 64-byte Spill
 ; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06]
 ; CHECK32-SKX-NEXT:    vmovups %zmm5, {{[0-9]+}}(%esp) ## 64-byte Spill
@@ -661,7 +661,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK32-SKX-NEXT:    .cfi_offset %xmm4, -384
 ; CHECK32-SKX-NEXT:    .cfi_offset %xmm5, -320
 ; CHECK32-SKX-NEXT:    .cfi_offset %xmm6, -256
-; CHECK32-SKX-NEXT:    .cfi_offset %xmm7, -160
+; CHECK32-SKX-NEXT:    .cfi_offset %xmm7, -192
 ; CHECK32-SKX-NEXT:    .cfi_offset %k0, -80
 ; CHECK32-SKX-NEXT:    .cfi_offset %k1, -72
 ; CHECK32-SKX-NEXT:    .cfi_offset %k2, -64
@@ -689,7 +689,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK32-SKX-NEXT:    vmovups {{[0-9]+}}(%esp), %zmm6 ## 64-byte Reload
 ; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06]
 ; CHECK32-SKX-NEXT:    vmovups {{[0-9]+}}(%esp), %zmm7 ## 64-byte Reload
-; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0xbc,0x24,0xe0,0x01,0x00,0x00]
+; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07]
 ; CHECK32-SKX-NEXT:    kmovq {{[0-9]+}}(%esp), %k0 ## 8-byte Reload
 ; CHECK32-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x02,0x00,0x00]
 ; CHECK32-SKX-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 ## 8-byte Reload
-- 
2.40.0