From b073e870d135ac27cd97d624330abf0f1fb1ed41 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 6 Dec 2012 15:40:13 -0800 Subject: [PATCH] x86inc: support stack mem allocation and re-alignment in PROLOGUE Use this in 8-bit loopfilter functions so they can be used if there is no aligned stack (e.g. x86-32 MSVC or ICC 10.x). --- common/deblock.c | 16 ++-- common/x86/deblock-a.asm | 27 ++---- common/x86/x86inc.asm | 195 +++++++++++++++++++++++++++++++-------- configure | 5 +- 4 files changed, 176 insertions(+), 67 deletions(-) diff --git a/common/deblock.c b/common/deblock.c index 1f05e831..fd840dc2 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -779,13 +779,13 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2; pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2; pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2; + pf->deblock_luma[1] = x264_deblock_v_luma_sse2; + pf->deblock_luma[0] = x264_deblock_h_luma_sse2; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) { - pf->deblock_luma[1] = x264_deblock_v_luma_sse2; - pf->deblock_luma[0] = x264_deblock_h_luma_sse2; pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2; - pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; - pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2; #if HIGH_BIT_DEPTH @@ -801,13 +801,13 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx; pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx; pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx; + pf->deblock_luma[1] = x264_deblock_v_luma_avx; + pf->deblock_luma[0] = x264_deblock_h_luma_avx; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { - pf->deblock_luma[1] = x264_deblock_v_luma_avx; - pf->deblock_luma[0] = x264_deblock_h_luma_avx; pf->deblock_chroma[1] = x264_deblock_v_chroma_avx; - pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx; - pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx; #if HIGH_BIT_DEPTH diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index b4cfa4a4..210761a8 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -1205,14 +1205,12 @@ DEBLOCK_LUMA ;----------------------------------------------------------------------------- ; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_%1_luma, 5,5 +cglobal deblock_%1_luma, 5,5,8,2*%2 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 dec r3 ; beta-1 add r4, r0 ; pix-3*stride - %assign pad 2*%2+12-(stack_offset&15) - SUB esp, pad mova m0, [r4+r1] ; p1 mova m1, [r4+2*r1] ; p0 @@ -1251,22 +1249,19 @@ cglobal deblock_%1_luma, 5,5 DEBLOCK_P0_Q0 mova [r4+2*r1], m1 mova [r0], m2 - ADD esp, pad RET ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int 
alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX cpuname -cglobal deblock_h_luma, 0,5 +cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12 mov r0, r0mp mov r3, r1m lea r4, [r3*3] sub r0, 4 lea r1, [r0+r4] - %assign pad 0x78-(stack_offset&15) - SUB esp, pad -%define pix_tmp esp+12 + %define pix_tmp esp+12*HAVE_ALIGNED_STACK ; transpose 6x16 -> tmp space TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp @@ -1308,7 +1303,6 @@ cglobal deblock_h_luma, 0,5 movq m3, [pix_tmp+0x48] TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) - ADD esp, pad RET %endmacro ; DEBLOCK_LUMA @@ -1439,7 +1433,7 @@ DEBLOCK_LUMA v, 16 %define mpb_0 m14 %define mpb_1 m15 %else - %define spill(x) [esp+16*x+((stack_offset+4)&15)] + %define spill(x) [esp+16*x] %define p2 [r4+r1] %define q2 [r0+2*r1] %define t4 spill(0) @@ -1454,10 +1448,7 @@ DEBLOCK_LUMA v, 16 ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_%1_luma_intra, 4,6,16 -%if ARCH_X86_64 == 0 - sub esp, 0x60 -%endif +cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50 lea r4, [r1*4] lea r5, [r1*3] ; 3*stride dec r2d ; alpha-1 @@ -1506,9 +1497,6 @@ cglobal deblock_%1_luma_intra, 4,6,16 LUMA_INTRA_SWAP_PQ LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] .end: -%if ARCH_X86_64 == 0 - add esp, 0x60 -%endif RET INIT_MMX cpuname @@ -1545,12 +1533,10 @@ cglobal deblock_h_luma_intra, 4,9 add rsp, 0x88 RET %else -cglobal deblock_h_luma_intra, 2,4 +cglobal deblock_h_luma_intra, 2,4,8,0x80 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] -%assign pad 0x8c-(stack_offset&15) - SUB rsp, pad %define pix_tmp rsp ; transpose 8x16 -> tmp space @@ -1581,7 +1567,6 @@ cglobal deblock_h_luma_intra, 2,4 lea r0, [r0+r1*8] lea r2, [r2+r1*8] TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) - ADD rsp, pad RET %endif ; ARCH_X86_64 %endmacro ; DEBLOCK_LUMA_INTRA diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 23f6f5ff..7888903a 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -105,7 +105,12 @@ CPU amdnop ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = list of names to define to registers +; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, +; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. 
@@ -140,11 +145,11 @@ CPU amdnop %define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory - %define r%1m [rsp + stack_offset + %3] - %define r%1mp qword r %+ %1m + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m %else - %define r%1m [esp + stack_offset + %3] - %define r%1mp dword r %+ %1m + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro @@ -205,12 +210,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %macro PUSH 1 push %1 - %assign stack_offset stack_offset+gprsize + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif %endmacro %macro POP 1 pop %1 - %assign stack_offset stack_offset-gprsize + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif %endmacro %macro PUSH_IF_USED 1-* @@ -242,14 +251,14 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %macro SUB 2 sub %1, %2 - %ifidn %1, rsp + %ifidn %1, rstk %assign stack_offset stack_offset+(%2) %endif %endmacro %macro ADD 2 add %1, %2 - %ifidn %1, rsp + %ifidn %1, rstk %assign stack_offset stack_offset-(%2) %endif %endmacro @@ -307,6 +316,79 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %assign n_arg_names %0 %endmacro +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%stack_alignment ((mmsize + 15) & ~15) + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if mmsize != 8 + %assign xmm_regs_used %2 + %endif + %if mmsize <= 16 && HAVE_ALIGNED_STACK + %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) + %if xmm_regs_used > 6 + %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 + %endif + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + mov rstk, rsp + %assign stack_size_padded stack_size + %if xmm_regs_used > 6 + %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 + %if mmsize == 32 && xmm_regs_used & 1 + ; re-align to 32 bytes + %assign stack_size_padded (stack_size_padded + 16) + %endif + %endif + %if %1 < 0 ; need to store rsp on stack + sub rsp, gprsize+stack_size_padded + and rsp, ~(%%stack_alignment-1) + %xdefine rstkm [rsp+stack_size_padded] + mov rstkm, rstk + %else ; can keep rsp in rstk during whole function + sub rsp, stack_size_padded + and rsp, ~(%%stack_alignment-1) + %xdefine rstkm rstk + %endif + %endif + %if xmm_regs_used > 6 + WIN64_PUSH_XMM + %endif + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) + %if %1 > 0 + %assign regs_used (regs_used + 1) + %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 + %warning "Stack pointer will overwrite register argument" + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx @@ -325,19 +407,27 @@ DECLARE_REG 12, R13, 104 DECLARE_REG 13, R14, 112 DECLARE_REG 14, R15, 120 -%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... 
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - %if mmsize == 8 - %assign xmm_regs_used 0 - %else + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS %4 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i + %endrep %endmacro %macro WIN64_SPILL_XMM 1 @@ -345,11 +435,7 @@ DECLARE_REG 14, R15, 120 ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 6 SUB rsp, (xmm_regs_used-6)*16+16 - %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) - %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i - %endrep + WIN64_PUSH_XMM %endif %endmacro @@ -358,19 +444,28 @@ DECLARE_REG 14, R15, 120 %assign %%i xmm_regs_used %rep (xmm_regs_used-6) %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)] %endrep - add %1, (xmm_regs_used-6)*16+16 + %if stack_size_padded == 0 + add %1, (xmm_regs_used-6)*16+16 + %endif + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) + mov rsp, rstkm + %else + add %1, stack_size_padded + %endif %endif %endmacro %macro WIN64_RESTORE_XMM 1 WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign stack_offset (stack_offset-stack_size_padded) %assign xmm_regs_used 0 %endmacro -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp @@ -399,19 +494,28 @@ DECLARE_REG 12, R13, 56 DECLARE_REG 13, R14, 64 DECLARE_REG 14, R15, 72 -%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS %4 + DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 9 || mmsize == 32 +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 %macro RET 0 +%if stack_size_padded > 0 +%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif POP_IF_USED 14, 13, 12, 11, 10, 9 %if mmsize == 32 vzeroupper @@ -432,7 +536,7 @@ DECLARE_REG 6, ebp, 28 %macro DECLARE_ARG 1-* %rep %0 - %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1m [rstk + stack_offset + 4*%1 + 4] %define r%1mp dword r%1m %rotate 1 %endrep @@ -440,21 +544,34 @@ DECLARE_REG 6, ebp, 28 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 -%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif %if regs_used > 7 %assign regs_used 7 %endif - ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS %4 + DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 3 || mmsize == 32 +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 %macro RET 0 +%if stack_size_padded > 0 +%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif POP_IF_USED 6, 5, 4, 3 %if mmsize == 32 vzeroupper @@ -469,6 +586,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endmacro %macro WIN64_RESTORE_XMM 1 %endmacro +%macro WIN64_PUSH_XMM 0 +%endmacro %endif %macro REP_RET 0 @@ -498,12 +617,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. -%macro cglobal 1-2+ ; name, [PROLOGUE args] -%if %0 == 1 - cglobal_internal %1 %+ SUFFIX -%else +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + ; the "" is a workaround for nasm, which fails if SUFFIX is empty + ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2) cglobal_internal %1 %+ SUFFIX, %2 -%endif %endmacro %macro cglobal_internal 1-2+ %ifndef cglobaled_%1 @@ -520,8 +637,12 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 align function_align %1: RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %xdefine rstk rsp %assign stack_offset 0 - %if %0 > 1 + %assign stack_size 0 + %assign stack_size_padded 0 + %assign xmm_regs_used 0 + %ifnidn %2, "" PROLOGUE %2 %endif %endmacro diff --git a/configure b/configure index 6b2d616c..94285b27 100755 --- a/configure +++ b/configure @@ -528,6 +528,7 @@ esac LDFLAGS="$LDFLAGS $libm" +aligned_stack=1 case $host_cpu in i*86) ARCH="X86" @@ -548,6 +549,7 @@ case $host_cpu in # < 11 is completely incapable of keeping a mod16 stack if cpp_check "" "" "__INTEL_COMPILER < 1100" ; then define BROKEN_STACK_ALIGNMENT + aligned_stack=0 # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so. elif cpp_check "" "" "__INTEL_COMPILER < 1200" ; then CFLAGS="$CFLAGS -falign-stack=assume-16-byte" @@ -555,7 +557,7 @@ case $host_cpu in # >= 12 defaults to a mod16 stack fi # icl on windows has no mod16 stack support - [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT + [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT && aligned_stack=0 fi if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho -DPREFIX" @@ -648,6 +650,7 @@ case $host_cpu in ARCH="$(echo $host_cpu | tr a-z A-Z)" ;; esac +ASFLAGS="$ASFLAGS -DHAVE_ALIGNED_STACK=${aligned_stack}" if [ $SYS = WINDOWS ]; then if ! rc_check "0 RCDATA {0}" ; then -- 2.40.0
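
Usage sketch (reviewer annotation, not part of the patch; git-am ignores text after the signature): the example below illustrates how a function can request scratch stack space through the new fourth cglobal/PROLOGUE parameter, in the same style as the deblock changes above (e.g. cglobal deblock_%1_luma, 5,5,8,2*%2). The function and argument names here are hypothetical; only the calling convention is taken from the patch.

INIT_XMM sse2
; Request 32 bytes of stack space. When the incoming stack is not known to be
; aligned (HAVE_ALIGNED_STACK=0) or mmsize==32, SETUP_STACK_POINTER silently
; bumps regs_used by one and uses that extra GPR (rstk) to hold the original
; rsp, so r0m etc. stay valid; rsp itself is re-aligned by ALLOC_STACK.
cglobal copy_tmp_example, 2,2,2,32, dst, src
    movu   m0, [srcq]
    mova   [rsp], m0     ; [rsp..rsp+31] is aligned scratch space
    mova   m1, [rsp]
    movu   [dstq], m1
    RET                  ; RET undoes the allocation: add rsp, stack_size_padded,
                         ; or mov rsp, rstkm when the stack was manually aligned

; Requesting a negative size (e.g. -32) keeps the extra register free: the
; original rsp is instead spilled to the stack, and rstkm resolves to
; [rsp+stack_size_padded] rather than to a register.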