From: Michael Kuperstein
Date: Wed, 23 Nov 2016 18:33:49 +0000 (+0000)
Subject: [X86] Allow folding of stack reloads when loading a subreg of the spilled reg
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4f54847874d8eeff4e4b63c6bf21f799a233d96e;p=llvm

[X86] Allow folding of stack reloads when loading a subreg of the spilled reg

We did not support subregs in InlineSpiller::foldMemoryOperand() because
targets may not deal with them correctly.

This adds a target hook to let the spiller know that a target can handle
subregs, and actually enables it for x86 for the case of stack slot reloads.
This fixes PR30832.

Differential Revision: https://reviews.llvm.org/D26521

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287792 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 41804f82d3c..fec51a3d033 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -817,6 +817,20 @@ public:
   /// anything was changed.
   virtual bool expandPostRAPseudo(MachineInstr &MI) const { return false; }
 
+  /// Check whether the target can fold a load that feeds a subreg operand
+  /// (or a subreg operand that feeds a store).
+  /// For example, X86 may want to return true if it can fold
+  /// movl (%esp), %eax
+  /// subb %al, ...
+  /// Into:
+  /// subb (%esp), ...
+  ///
+  /// Ideally, we'd like the target implementation of foldMemoryOperand() to
+  /// reject subregs - but since this behavior used to be enforced in the
+  /// target-independent code, moving this responsibility to the targets
+  /// has the potential of causing nasty silent breakage in out-of-tree targets.
+  virtual bool isSubregFoldable() const { return false; }
+
   /// Attempt to fold a load or store of the specified stack
   /// slot into the specified machine instruction for the specified operand(s).
   /// If this is possible, a new instruction is returned with the specified
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 3ccc18d120f..3e5ae5f5f07 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -739,9 +739,12 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
   bool WasCopy = MI->isCopy();
   unsigned ImpReg = 0;
 
-  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
-                       MI->getOpcode() == TargetOpcode::PATCHPOINT ||
-                       MI->getOpcode() == TargetOpcode::STACKMAP);
+  // Spill subregs if the target allows it.
+  // We always want to spill subregs for stackmap/patchpoint pseudos.
+  bool SpillSubRegs = TII.isSubregFoldable() ||
+                      MI->getOpcode() == TargetOpcode::STATEPOINT ||
+                      MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+                      MI->getOpcode() == TargetOpcode::STACKMAP;
 
   // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
   // operands.
@@ -754,7 +757,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
       ImpReg = MO.getReg();
       continue;
     }
-    // FIXME: Teach targets to deal with subregs.
+
     if (!SpillSubRegs && MO.getSubReg())
       return false;
     // We cannot fold a load instruction into a def.
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index 265b4bfaff1..01f91b96b58 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -515,6 +515,31 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
   assert(MBB && "foldMemoryOperand needs an inserted instruction");
   MachineFunction &MF = *MBB->getParent();
 
+  // If we're not folding a load into a subreg, the size of the load is the
+  // size of the spill slot. But if we are, we need to figure out what the
+  // actual load size is.
+  int64_t MemSize = 0;
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  if (Flags & MachineMemOperand::MOStore) {
+    MemSize = MFI.getObjectSize(FI);
+  } else {
+    for (unsigned Idx : Ops) {
+      int64_t OpSize = MFI.getObjectSize(FI);
+
+      if (auto SubReg = MI.getOperand(Idx).getSubReg()) {
+        unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg);
+        if (SubRegSize > 0 && !(SubRegSize % 8))
+          OpSize = SubRegSize / 8;
+      }
+
+      MemSize = std::max(MemSize, OpSize);
+    }
+  }
+
+  assert(MemSize && "Did not expect a zero-sized stack slot");
+
   MachineInstr *NewMI = nullptr;
 
   if (MI.getOpcode() == TargetOpcode::STACKMAP ||
@@ -538,10 +563,9 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
     assert((!(Flags & MachineMemOperand::MOLoad) ||
             NewMI->mayLoad()) &&
            "Folded a use to a non-load!");
-    const MachineFrameInfo &MFI = MF.getFrameInfo();
     assert(MFI.getObjectOffset(FI) != -1);
     MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI),
+        MachinePointerInfo::getFixedStack(MF, FI), Flags, MemSize,
         MFI.getObjectAlignment(FI));
     NewMI->addMemOperand(MF, MMO);
 
@@ -558,7 +582,6 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
 
   const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
   MachineBasicBlock::iterator Pos = MI;
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
   if (Flags == MachineMemOperand::MOStore)
     storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 967857c4c5f..0a85a9902c0 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -6843,6 +6843,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
   if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
     return nullptr;
 
+  // Don't fold subreg spills, or reloads that use a high subreg.
+  for (auto Op : Ops) {
+    MachineOperand &MO = MI.getOperand(Op);
+    auto SubReg = MO.getSubReg();
+    if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
+      return nullptr;
+  }
+
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   unsigned Size = MFI.getObjectSize(FrameIndex);
   unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
@@ -6967,6 +6975,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
     LiveIntervals *LIS) const {
+
+  // TODO: Support the case where LoadMI loads a wide register, but MI
+  // only uses a subreg.
+  for (auto Op : Ops) {
+    if (MI.getOperand(Op).getSubReg())
+      return nullptr;
+  }
+
   // If loading from a FrameIndex, fold directly from the FrameIndex.
   unsigned NumOps = LoadMI.getDesc().getNumOperands();
   int FrameIndex;
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index bc43820a024..a0292bbdaed 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -378,6 +378,10 @@ public:
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
+  /// Check whether the target can fold a load that feeds a subreg operand
+  /// (or a subreg operand that feeds a store).
+  bool isSubregFoldable() const override { return true; }
+
   /// foldMemoryOperand - If this target supports it, fold a load or store of
   /// the specified stack slot into the specified machine instruction for the
   /// specified operand(s). If this is possible, the target should perform the
diff --git a/test/CodeGen/X86/partial-fold32.ll b/test/CodeGen/X86/partial-fold32.ll
index ba3f73ba90f..7fc1ed3521e 100644
--- a/test/CodeGen/X86/partial-fold32.ll
+++ b/test/CodeGen/X86/partial-fold32.ll
@@ -3,8 +3,7 @@
 define fastcc i8 @fold32to8(i32 %add, i8 %spill) {
 ; CHECK-LABEL: fold32to8:
 ; CHECK: movl %ecx, (%esp) # 4-byte Spill
-; CHECK: movl (%esp), %eax # 4-byte Reload
-; CHECK: subb %al, %dl
+; CHECK: subb (%esp), %dl # 1-byte Folded Reload
 entry:
   tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
   %trunc = trunc i32 %add to i8
diff --git a/test/CodeGen/X86/partial-fold64.ll b/test/CodeGen/X86/partial-fold64.ll
index b9ea7d6773a..15c9d194be4 100644
--- a/test/CodeGen/X86/partial-fold64.ll
+++ b/test/CodeGen/X86/partial-fold64.ll
@@ -3,8 +3,7 @@
 define i32 @fold64to32(i64 %add, i32 %spill) {
 ; CHECK-LABEL: fold64to32:
 ; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; CHECK: subl %eax, %esi
+; CHECK: subl -{{[0-9]+}}(%rsp), %esi # 4-byte Folded Reload
 entry:
   tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
   %trunc = trunc i64 %add to i32
@@ -15,8 +14,7 @@
 define i8 @fold64to8(i64 %add, i8 %spill) {
 ; CHECK-LABEL: fold64to8:
 ; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; CHECK: subb %al, %sil
+; CHECK: subb -{{[0-9]+}}(%rsp), %sil # 1-byte Folded Reload
 entry:
   tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
   %trunc = trunc i64 %add to i8
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 0453dc18d01..78522948f0a 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -4788,9 +4788,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX1-NEXT: orl %ebx, %r14d
 ; AVX1-NEXT: shlq $32, %r14
 ; AVX1-NEXT: orq %r15, %r14
-; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movw %ax, %bx
 ; AVX1-NEXT: shll $16, %ebx
@@ -4856,9 +4855,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX2-NEXT: orl %ebx, %r14d
 ; AVX2-NEXT: shlq $32, %r14
 ; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movw %ax, %bx
 ; AVX2-NEXT: shll $16, %ebx
@@ -5585,9 +5583,8 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movl %eax, %r12d
 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
@@ -5654,9 +5651,8 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movl %eax, %r12d
 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
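
Editor's note on adopting the new hook (a sketch, not part of the patch): isSubregFoldable() defaults to false, so out-of-tree targets keep the old no-subreg-folding behavior until they opt in. Opting in follows the same pattern as the X86 change above: return true from isSubregFoldable(), and have foldMemoryOperandImpl() explicitly reject whatever subreg folds the target cannot actually perform. "MyTarget"/"MyTargetInstrInfo" below are hypothetical names used only for illustration; the override shapes mirror the X86 hunks in this commit.

    // Hypothetical out-of-tree target, shown only to illustrate the pattern.
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetInstrInfo.h"
    using namespace llvm;

    // In MyTargetInstrInfo.h, advertise subreg folding support to the spiller:
    //   bool isSubregFoldable() const override { return true; }

    // In MyTargetInstrInfo.cpp, filter out the subreg folds this target cannot
    // handle before doing any real folding work.
    MachineInstr *MyTargetInstrInfo::foldMemoryOperandImpl(
        MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
        MachineBasicBlock::iterator InsertPt, int FrameIndex,
        LiveIntervals *LIS) const {
      for (unsigned Op : Ops) {
        const MachineOperand &MO = MI.getOperand(Op);
        // Once isSubregFoldable() returns true, subreg operands can reach this
        // hook; reject subreg defs (partial-register spills) so the spiller
        // falls back to an ordinary spill/reload for them.
        if (MO.getSubReg() && MO.isDef())
          return nullptr;
      }
      // ... target-specific folding of the remaining cases would go here ...
      return nullptr;
    }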