From: Henrik Gramner
Date: Sat, 11 May 2013 21:39:09 +0000 (+0200)
Subject: x86inc: Utilize the shadow space on 64-bit Windows
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=30c91f62906ce08b5d227002b38ebd64f1291fae;p=libx264

x86inc: Utilize the shadow space on 64-bit Windows

Store XMM6 and XMM7 in the shadow space in functions that clobber them.
This way we don't have to adjust the stack pointer as often, reducing the
number of instructions as well as code size.
---

diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 9692621a..5032733a 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -1138,15 +1138,13 @@ cglobal deblock_v_luma, 5,5,10
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma, 5,9
+cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64
     lea    r8, [r1*3]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
 %if WIN64
-    sub    rsp, 0x98
-    %define pix_tmp rsp+0x30
+    %define pix_tmp rsp+0x30 ; shadow space + r4
 %else
-    sub    rsp, 0x68
     %define pix_tmp rsp
 %endif
@@ -1186,11 +1184,6 @@ cglobal deblock_h_luma, 5,9
     movq   m3, [pix_tmp+0x40]
     TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)
-%if WIN64
-    add    rsp, 0x98
-%else
-    add    rsp, 0x68
-%endif
     RET
 %endmacro
@@ -1508,12 +1501,15 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,9
+cglobal deblock_h_luma_intra, 4,9,0,0x80
     lea    r8, [r1*3]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
-    sub    rsp, 0x88
+%if WIN64
+    %define pix_tmp rsp+0x20 ; shadow space
+%else
     %define pix_tmp rsp
+%endif

     ; transpose 8x16 -> tmp space
     TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
@@ -1534,7 +1530,6 @@ cglobal deblock_h_luma_intra, 4,9
     sub    r5, r7
     shr    r7, 3
     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-    add    rsp, 0x88
     RET
 %else
 cglobal deblock_h_luma_intra, 2,4,8,0x80
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 9f368f4b..bc2854e7 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1794,10 +1794,9 @@ ALIGN 4
 %if ARCH_X86_64 ; too many regs for x86_32
     RESET_MM_PERMUTATION
 %if WIN64
-%if xmm_regs_used > 6
-    %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
-    %assign xmm_regs_used 6
-%endif
+    %assign stack_offset stack_offset - stack_size_padded
+    %assign stack_size_padded 0
+    %assign xmm_regs_used 0
 %endif
 .mc1dy:
     and    t2d, 7
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 4ee52fd6..748ecf92 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4422,7 +4422,7 @@ cglobal intra_sad_x9_8x8, 5,7,8
     mov    r6, rsp
     and    rsp, ~31
-    SUB    rsp, 0x240
+    sub    rsp, 0x240
     movu   m5, [r0+0*FENC_STRIDE]
     movu   m6, [r0+4*FENC_STRIDE]
     punpcklqdq m5, [r0+2*FENC_STRIDE]
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 63b7f09d..ff66e087 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1733,12 +1733,12 @@ cglobal pixel_sad_x3_%1x%2_cache%3_%6
 .split:
 %if ARCH_X86_64
     PROLOGUE 6,9
+    push   r3
+    push   r2
 %if WIN64
     movsxd r4, r4d
-    sub    rsp, 8
+    sub    rsp, 40 ; shadow space and alignment
 %endif
-    push   r3
-    push   r2
     mov    r2, r1
     mov    r1, FENC_STRIDE
     mov    r3, r4
@@ -1747,7 +1747,7 @@ cglobal pixel_sad_x3_%1x%2_cache%3_%6
     call   pixel_sad_%1x%2_cache%3_%5
     mov    [r8], eax
 %if WIN64
-    mov    r2, [rsp]
+    mov    r2, [rsp+40+0*8]
 %else
     pop    r2
 %endif
@@ -1755,7 +1755,7 @@ cglobal pixel_sad_x3_%1x%2_cache%3_%6
     call   pixel_sad_%1x%2_cache%3_%5
     mov    [r8+4], eax
 %if WIN64
-    mov    r2, [rsp+8]
+    mov    r2, [rsp+40+1*8]
 %else
     pop    r2
 %endif
@@ -1763,7 +1763,7 @@ cglobal pixel_sad_x3_%1x%2_cache%3_%6
     call   pixel_sad_%1x%2_cache%3_%5
     mov    [r8+8], eax
 %if WIN64
-    add    rsp, 24
+    add    rsp, 40+2*8
 %endif
     RET
 %else
@@ -1803,6 +1803,9 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
     push   r4
     push   r3
     push   r2
+%if WIN64
+    sub    rsp, 32 ; shadow space
+%endif
     mov    r2, r1
     mov    r1, FENC_STRIDE
     mov    r3, r5
@@ -1810,7 +1813,7 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
     call   pixel_sad_%1x%2_cache%3_%5
     mov    [r8], eax
 %if WIN64
-    mov    r2, [rsp]
+    mov    r2, [rsp+32+0*8]
 %else
     pop    r2
 %endif
@@ -1818,7 +1821,7 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
     call   pixel_sad_%1x%2_cache%3_%5
     mov    [r8+4], eax
 %if WIN64
-    mov    r2, [rsp+8]
+    mov    r2, [rsp+32+1*8]
 %else
     pop    r2
 %endif
@@ -1826,7 +1829,7 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
     call   pixel_sad_%1x%2_cache%3_%5
     mov    [r8+8], eax
 %if WIN64
-    mov    r2, [rsp+16]
+    mov    r2, [rsp+32+2*8]
 %else
     pop    r2
 %endif
@@ -1834,7 +1837,7 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
     call   pixel_sad_%1x%2_cache%3_%5
     mov    [r8+12], eax
 %if WIN64
-    add    rsp, 24
+    add    rsp, 32+3*8
 %endif
     RET
 %else
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 40bb4914..45686828 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -310,14 +310,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
             %if stack_size < 0
                 %assign stack_size -stack_size
             %endif
-            %if mmsize != 8
-                %assign xmm_regs_used %2
+            %assign stack_size_padded stack_size
+            %if WIN64
+                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+                    %endif
+                %endif
             %endif
             %if mmsize <= 16 && HAVE_ALIGNED_STACK
-                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                %endif
+                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                 SUB rsp, stack_size_padded
             %else
                 %assign %%reg_num (regs_used - 1)
@@ -327,14 +331,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                 ; stack in a single instruction (i.e. mov rsp, rstk or mov
                 ; rsp, [rsp+stack_size_padded])
                 mov  rstk, rsp
-                %assign stack_size_padded stack_size
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                    %if mmsize == 32 && xmm_regs_used & 1
-                        ; re-align to 32 bytes
-                        %assign stack_size_padded (stack_size_padded + 16)
-                    %endif
-                %endif
                 %if %1 < 0 ; need to store rsp on stack
                     sub  rsp, gprsize+stack_size_padded
                     and  rsp, ~(%%stack_alignment-1)
@@ -346,9 +342,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                     %xdefine rstkm rstk
                 %endif
             %endif
-            %if xmm_regs_used > 6
-                WIN64_PUSH_XMM
-            %endif
+            WIN64_PUSH_XMM
         %endif
     %endif
 %endmacro
@@ -409,40 +403,55 @@ DECLARE_REG 14, R15, 120
 %endmacro

 %macro WIN64_PUSH_XMM 0
-    %assign %%i xmm_regs_used
-    %rep (xmm_regs_used-6)
-        %assign %%i %%i-1
-        movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
-    %endrep
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset + 8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
 %endmacro

 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 6
-        SUB rsp, (xmm_regs_used-6)*16+16
-        WIN64_PUSH_XMM
+    %if xmm_regs_used > 8
+        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+        SUB rsp, stack_size_padded
     %endif
+    WIN64_PUSH_XMM
 %endmacro

 %macro WIN64_RESTORE_XMM_INTERNAL 1
-    %if xmm_regs_used > 6
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
         %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
+        %rep xmm_regs_used-8
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
         %endrep
-        %if stack_size_padded == 0
-            add %1, (xmm_regs_used-6)*16+16
-        %endif
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
             mov rsp, rstkm
         %else
             add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded
         %endif
     %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+    %endif
 %endmacro

 %macro WIN64_RESTORE_XMM 1
@@ -659,12 +668,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
     align function_align
     %2:
-    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
-    %xdefine rstk rsp
-    %assign stack_offset 0
-    %assign stack_size 0
-    %assign stack_size_padded 0
-    %assign xmm_regs_used 0
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index de330437..722d746b 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2008-2013 x264 project
 ;*
 ;* Authors: Loren Merritt
-;*          Henrik Gramner
+;*          Henrik Gramner
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -88,8 +88,7 @@ cglobal checkasm_stack_clobber, 1,2
 ;-----------------------------------------------------------------------------
 ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal checkasm_call, 2,15,16
-    SUB  rsp, max_args*8+16
+cglobal checkasm_call, 2,15,16,max_args*8+8
     mov  r6, r0
     mov  [rsp+max_args*8], r1
@@ -158,7 +157,6 @@ cglobal checkasm_call, 2,15,16
     mov  dword [r1], 0
     mov  rax, r9
 .ok:
-    ADD  rsp, max_args*8+16
     RET
 %else
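
Aside (not part of the patch): the Win64 calling convention requires every caller to reserve a 32-byte "shadow space" directly above the return address, i.e. at [rsp+8]..[rsp+39] on function entry. Since rsp is congruent to 8 mod 16 at that point, [rsp+8] and [rsp+24] are both 16-byte aligned, which is why xmm6/xmm7 can be spilled there with movaps and the sub/add rsp pairs dropped. A minimal standalone NASM sketch of the idea, outside the x86inc macro framework and with a hypothetical function name:

    section .text
    global example_clobber_xmm67

    ; hypothetical leaf function that clobbers the callee-saved xmm6/xmm7
    example_clobber_xmm67:
        movaps [rsp+8],  xmm6   ; shadow space reserved by our caller: [rsp+8]..[rsp+39]
        movaps [rsp+24], xmm7   ; both slots are 16-byte aligned, no rsp adjustment needed
        ; ... work that overwrites xmm6/xmm7 goes here ...
        movaps xmm6, [rsp+8]    ; restore the callee-saved registers before returning
        movaps xmm7, [rsp+24]
        ret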