From: Zvi Rackover
Date: Tue, 24 Oct 2017 07:38:29 +0000 (+0000)
Subject: X86: Fix X86CallFrameOptimization to search for the COPY StackPointer
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=98150369032943ac04d6f1df4a3ed4b2787b0d01;p=llvm

X86: Fix X86CallFrameOptimization to search for the COPY StackPointer

SelectionDAG inserts a copy of ESP into a virtual register.
X86CallFrameOptimization assumed that this COPY, if present, is always right
after the call-frame setup instruction (ADJCALLSTACKDOWN). That assumption is
wrong: the COPY can be located anywhere between the call-frame setup
instruction and its first use. If the COPY happened to be located somewhere
other than where X86CallFrameOptimization expected, visiting it while
processing the call chain led to a conservative bail-out.

The fix is quite straightforward: scan ahead for the stack-pointer copy and
make note of it so it can be ignored while processing the call chain.

Fixes PR34903

Differential Revision: https://reviews.llvm.org/D38730

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316416 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index c6022f082c8..32b18b085de 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -363,14 +363,23 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
   ++I;
 
   unsigned StackPtr = RegInfo.getStackRegister();
+  auto StackPtrCopyInst = MBB.end();
   // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
-  // register here. If it's there, use that virtual register as stack pointer
-  // instead.
-  if (I->isCopy() && I->getOperand(0).isReg() && I->getOperand(1).isReg() &&
-      I->getOperand(1).getReg() == StackPtr) {
-    Context.SPCopy = &*I++;
-    StackPtr = Context.SPCopy->getOperand(0).getReg();
-  }
+  // register. If it's there, use that virtual register as stack pointer
+  // instead. Also, we need to locate this instruction so that we can later
+  // safely ignore it while doing the conservative processing of the call chain.
+  // The COPY can be located anywhere between the call-frame setup
+  // instruction and its first use. We use the call instruction as a boundary
+  // because it is usually cheaper to check if an instruction is a call than
+  // checking if an instruction uses a register.
+  for (auto J = I; !J->isCall(); ++J)
+    if (J->isCopy() && J->getOperand(0).isReg() && J->getOperand(1).isReg() &&
+        J->getOperand(1).getReg() == StackPtr) {
+      StackPtrCopyInst = J;
+      Context.SPCopy = &*J++;
+      StackPtr = Context.SPCopy->getOperand(0).getReg();
+      break;
+    }
 
   // Scan the call setup sequence for the pattern we're looking for.
   // We only handle a simple case - a sequence of store instructions that
@@ -379,16 +388,15 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
   if (MaxAdjust > 4)
     Context.MovVector.resize(MaxAdjust, nullptr);
 
-  InstClassification Classification;
   DenseSet<unsigned int> UsedRegs;
-  while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) != Exit) {
-    if (Classification == Skip) {
-      ++I;
+
+  for (InstClassification Classification = Skip; Classification != Exit; ++I) {
+    // If this is the COPY of the stack pointer, it's ok to ignore.
+    if (I == StackPtrCopyInst)
+      continue;
+    Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs);
+    if (Classification != Convert)
       continue;
-    }
-
-    // We know the instruction has a supported store opcode.
// We only want movs of the form: // mov imm/reg, k(%StackPtr) @@ -431,10 +439,10 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (RegInfo.isPhysicalRegister(Reg)) UsedRegs.insert(Reg); } - - ++I; } + --I; + // We now expect the end of the sequence. If we stopped early, // or reached the end of the block without finding a call, bail. if (I == MBB.end() || !I->isCall()) diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll index f2b9dee9103..8d289fa9fb0 100644 --- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -31,18 +31,44 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { ; i386-NEXT: sahf ; i386-NEXT: jne +; In the following case we get a long chain of EFLAGS save/restore due to +; a sequence of: +; cmpxchg8b (implicit-def eflags) +; eax = copy eflags +; adjcallstackdown32 +; ... +; use of eax +; During PEI the adjcallstackdown32 is replaced with the subl which +; clobbers eflags, effectively interfering in the liveness interval. +; Is this a case we care about? Maybe no, considering this issue +; happens with the fast pre-regalloc scheduler enforced. A more +; performant scheduler would move the adjcallstackdown32 out of the +; eflags liveness interval. + ; i386f-LABEL: test_intervening_call: ; i386f: cmpxchg8b -; i386f-NEXT: movl %eax, (%esp) -; i386f-NEXT: movl %edx, 4(%esp) -; i386f-NEXT: seto %al +; i386f-NEXT: pushl %eax +; i386f-NEXT: seto %al ; i386f-NEXT: lahf -; i386f-NEXT: movl %eax, [[FLAGS:%.*]] -; i386f-NEXT: calll bar -; i386f-NEXT: movl [[FLAGS]], %eax -; i386f-NEXT: addb $127, %al +; i386f-NEXT: movl %eax, [[FLAGS:%.*]] +; i386f-NEXT: popl %eax +; i386f-NEXT: subl $8, %esp +; i386f-NEXT: pushl %eax +; i386f-NEXT: movl %ecx, %eax +; i386f-NEXT: addb $127, %al ; i386f-NEXT: sahf -; i386f-NEXT: jne +; i386f-NEXT: popl %eax +; i386f-NEXT: pushl %eax +; i386f-NEXT: seto %al +; i386f-NEXT: lahf +; i386f-NEXT: movl %eax, %esi +; i386f-NEXT: popl %eax +; i386f-NEXT: pushl %edx +; i386f-NEXT: pushl %eax +; i386f-NEXT: calll bar +; i386f-NEXT: addl $16, %esp +; i386f-NEXT: movl %esi, %eax +; i386f-NEXT: addb $127, %al ; x8664-LABEL: test_intervening_call: ; x8664: cmpxchgq diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll index d715ccfa8c6..ad34381f43e 100644 --- a/test/CodeGen/X86/movtopush.ll +++ b/test/CodeGen/X86/movtopush.ll @@ -228,16 +228,16 @@ entry: ; NORMAL-NEXT: pushl $2 ; NORMAL-NEXT: pushl $1 ; NORMAL-NEXT: call -; NORMAL-NEXT: subl $4, %esp -; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]] -; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]] -; NORMAL-NEXT: movl [[E2]], 4(%esp) -; NORMAL-NEXT: movl [[E1]], (%esp) -; NORMAL-NEXT: leal 32(%esp), [[E3:%e..]] -; NORMAL-NEXT: movl [[E3]], 16(%esp) -; NORMAL-NEXT: leal 28(%esp), [[E4:%e..]] -; NORMAL-NEXT: movl [[E4]], 12(%esp) -; NORMAL-NEXT: movl $6, 8(%esp) +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: movl (%esp), [[E1:%e..]] +; NORMAL-NEXT: movl 4(%esp), [[E2:%e..]] +; NORMAL-NEXT: leal 16(%esp), [[E3:%e..]] +; NORMAL-NEXT: leal 12(%esp), [[E4:%e..]] +; NORMAL-NEXT: pushl [[E3]] +; NORMAL-NEXT: pushl [[E4]] +; NORMAL-NEXT: pushl $6 +; NORMAL-NEXT: pushl [[E2]] +; NORMAL-NEXT: pushl [[E1]] ; NORMAL-NEXT: call ; NORMAL-NEXT: addl $20, %esp define void @test9() optsize { @@ -297,10 +297,10 @@ define void @test11() optsize { ; Converting one mov into a push isn't worth it when ; doing so forces too much overhead for other calls. 
 ; NORMAL-LABEL: test12:
-; NORMAL: movl $8, 12(%esp)
-; NORMAL-NEXT: movl $7, 8(%esp)
-; NORMAL-NEXT: movl $6, 4(%esp)
-; NORMAL-NEXT: movl $5, (%esp)
+; NORMAL: pushl $8
+; NORMAL-NEXT: pushl $7
+; NORMAL-NEXT: pushl $6
+; NORMAL-NEXT: pushl $5
 ; NORMAL-NEXT: calll _good
 define void @test12() optsize {
 entry:
@@ -318,18 +318,22 @@ entry:
 ; NORMAL-NEXT: pushl $2
 ; NORMAL-NEXT: pushl $1
 ; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: subl $4, %esp
-; NORMAL: movl $8, 16(%esp)
-; NORMAL-NEXT: movl $7, 12(%esp)
-; NORMAL-NEXT: movl $6, 8(%esp)
-; NORMAL-NEXT: calll _struct
-; NORMAL-NEXT: addl $20, %esp
-; NORMAL-NEXT: pushl $12
-; NORMAL-NEXT: pushl $11
-; NORMAL-NEXT: pushl $10
-; NORMAL-NEXT: pushl $9
-; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: movl (%esp), %eax
+; NORMAL-NEXT: movl 4(%esp), %ecx
+; NORMAL-NEXT: pushl $8
+; NORMAL-NEXT: pushl $7
+; NORMAL-NEXT: pushl $6
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: calll _struct
+; NORMAL-NEXT: addl $20, %esp
+; NORMAL-NEXT: pushl $12
+; NORMAL-NEXT: pushl $11
+; NORMAL-NEXT: pushl $10
+; NORMAL-NEXT: pushl $9
+; NORMAL-NEXT: calll _good
+; NORMAL-NEXT: addl $16, %esp
 define void @test12b() optsize {
 entry:
   %s = alloca %struct.s, align 4
diff --git a/test/CodeGen/X86/movtopush.mir b/test/CodeGen/X86/movtopush.mir
index 82441c7134f..42856ea1690 100644
--- a/test/CodeGen/X86/movtopush.mir
+++ b/test/CodeGen/X86/movtopush.mir
@@ -1,5 +1,6 @@
 # RUN: llc -mtriple=i686-windows --run-pass="x86-cf-opt" %s -o - | FileCheck %s
 
+# PR34903
 --- |
   target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
   target triple = "i686--windows-msvc"
@@ -39,17 +40,16 @@
 # CHECK-NEXT: PUSH32i8 1, implicit-def %esp, implicit %esp
 # CHECK-NEXT: CALLpcrel32 @good, csr_32, implicit %esp, implicit-def %esp
 # CHECK-NEXT: ADJCALLSTACKUP32 16, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit %esp
-# CHECK-NEXT: ADJCALLSTACKDOWN32 20, 0, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit %esp
+# CHECK-NEXT: ADJCALLSTACKDOWN32 20, 0, 20, implicit-def dead %esp, implicit-def dead %eflags, implicit %esp
 # CHECK-NEXT: %1 = MOV32rm %stack.2.s, 1, _, 0, _ :: (load 4 from %stack.2.s, align 8)
 # CHECK-NEXT: %2 = MOV32rm %stack.2.s, 1, _, 4, _ :: (load 4 from %stack.2.s + 4)
-# CHECK-NEXT: %3 = COPY %esp
-# CHECK-NEXT: MOV32mr %3, 1, _, 4, _, killed %2 :: (store 4)
-# CHECK-NEXT: MOV32mr %3, 1, _, 0, _, killed %1 :: (store 4)
 # CHECK-NEXT: %4 = LEA32r %stack.0.p, 1, _, 0, _
-# CHECK-NEXT: MOV32mr %3, 1, _, 16, _, killed %4 :: (store 4 into stack + 16)
 # CHECK-NEXT: %5 = LEA32r %stack.1.q, 1, _, 0, _
-# CHECK-NEXT: MOV32mr %3, 1, _, 12, _, killed %5 :: (store 4 into stack + 12)
-# CHECK-NEXT: MOV32mi %3, 1, _, 8, _, 6 :: (store 4 into stack + 8)
+# CHECK-NEXT: PUSH32r %4, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32r %5, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32i8 6, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32r %2, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32r %1, implicit-def %esp, implicit %esp
 # CHECK-NEXT: CALLpcrel32 @struct, csr_32, implicit %esp, implicit-def %esp
 # CHECK-NEXT: ADJCALLSTACKUP32 20, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit %esp
 # CHECK-NEXT: RET 0
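
For reference, the scan-ahead idea the patch adds to collectCallInfo can be distilled into the short standalone sketch below. This is not the committed code verbatim: the helper name findStackPointerCopy and its FrameSetup parameter are illustrative only, and the sketch assumes the MachineBasicBlock/MachineInstr APIs as they existed at this revision (unsigned register numbers, isCopy, isCall, getOperand).

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>

using namespace llvm;

// Simplified sketch: starting right after the call-frame setup instruction
// (ADJCALLSTACKDOWN), walk forward until the call and return the first COPY
// whose source operand is the stack pointer. Returns MBB.end() if the call
// sequence contains no such copy (e.g. when the code came from FastISel).
static MachineBasicBlock::iterator
findStackPointerCopy(MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator FrameSetup,
                     unsigned StackPtr) {
  for (auto J = std::next(FrameSetup); J != MBB.end() && !J->isCall(); ++J)
    if (J->isCopy() && J->getOperand(0).isReg() && J->getOperand(1).isReg() &&
        J->getOperand(1).getReg() == StackPtr)
      return J; // e.g. "%3 = COPY %esp", located anywhere before the call
  return MBB.end();
}

The committed change inlines this loop into collectCallInfo, additionally records the copy in Context.SPCopy, and switches StackPtr to the copy's destination register; using the call instruction as the scan boundary keeps the lookahead cheap, as the comment in the patch explains.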