;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma, 5,5
+cglobal deblock_%1_luma, 5,5,8,2*%2
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
- %assign pad 2*%2+12-(stack_offset&15)
- SUB esp, pad
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
- ADD esp, pad
RET
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma, 0,5
+cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
lea r1, [r0+r4]
- %assign pad 0x78-(stack_offset&15)
- SUB esp, pad
-%define pix_tmp esp+12
+ %define pix_tmp esp+12*HAVE_ALIGNED_STACK
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
- ADD esp, pad
RET
%endmacro ; DEBLOCK_LUMA
%define mpb_0 m14
%define mpb_1 m15
%else
- %define spill(x) [esp+16*x+((stack_offset+4)&15)]
+ %define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_intra, 4,6,16
-%if ARCH_X86_64 == 0
- sub esp, 0x60
-%endif
+cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
-%if ARCH_X86_64 == 0
- add esp, 0x60
-%endif
RET
INIT_MMX cpuname
add rsp, 0x88
RET
%else
-cglobal deblock_h_luma_intra, 2,4
+cglobal deblock_h_luma_intra, 2,4,8,0x80
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
-%assign pad 0x8c-(stack_offset&15)
- SUB rsp, pad
%define pix_tmp rsp
; transpose 8x16 -> tmp space
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
- ADD rsp, pad
RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = list of names to define to registers
+; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
+; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
+; and an extra register will be allocated to hold the original stack
+; pointer (so that r0m etc. remain valid). To prevent the use of an extra
+; register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
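+; (a sketch using a hypothetical function name:)
+;   cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local
+; variable (tmp), while
+;   cglobal foo, 2,3,0,32, dst, src, tmp
+; additionally reserves 32 bytes of stack space via the new %4 parameter.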
%define r%1m %2d
%define r%1mp %2
%elif ARCH_X86_64 ; memory
- %define r%1m [rsp + stack_offset + %3]
- %define r%1mp qword r %+ %1m
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp qword r %+ %1 %+ m
%else
- %define r%1m [esp + stack_offset + %3]
- %define r%1mp dword r %+ %1m
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp dword r %+ %1 %+ m
%endif
%define r%1 %2
%endmacro
%macro PUSH 1
push %1
- %assign stack_offset stack_offset+gprsize
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset+gprsize
+ %endif
%endmacro
%macro POP 1
pop %1
- %assign stack_offset stack_offset-gprsize
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset-gprsize
+ %endif
%endmacro
%macro PUSH_IF_USED 1-*
%macro SUB 2
sub %1, %2
- %ifidn %1, rsp
+ %ifidn %1, rstk
%assign stack_offset stack_offset+(%2)
%endif
%endmacro
%macro ADD 2
add %1, %2
- %ifidn %1, rsp
+ %ifidn %1, rstk
%assign stack_offset stack_offset-(%2)
%endif
%endmacro
%assign n_arg_names %0
%endmacro
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+ %ifnum %1
+ %if %1 != 0
+ %assign %%stack_alignment ((mmsize + 15) & ~15)
+ %assign stack_size %1
+ %if stack_size < 0
+ %assign stack_size -stack_size
+ %endif
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %endif
+ %if mmsize <= 16 && HAVE_ALIGNED_STACK
+ %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
+ %if xmm_regs_used > 6
+ %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+ %endif
+ SUB rsp, stack_size_padded
+ %else
+ %assign %%reg_num (regs_used - 1)
+ %xdefine rstk r %+ %%reg_num
+ ; align stack, and save original stack location directly above
+ ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+ ; stack in a single instruction (i.e. mov rsp, rstk or mov
+ ; rsp, [rsp+stack_size_padded])
+ mov rstk, rsp
+ %assign stack_size_padded stack_size
+ %if xmm_regs_used > 6
+ %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+ %if mmsize == 32 && xmm_regs_used & 1
+ ; re-align to 32 bytes
+ %assign stack_size_padded (stack_size_padded + 16)
+ %endif
+ %endif
+ %if %1 < 0 ; need to store rsp on stack
+ sub rsp, gprsize+stack_size_padded
+ and rsp, ~(%%stack_alignment-1)
+ %xdefine rstkm [rsp+stack_size_padded]
+ mov rstkm, rstk
+ %else ; can keep rsp in rstk during whole function
+ sub rsp, stack_size_padded
+ and rsp, ~(%%stack_alignment-1)
+ %xdefine rstkm rstk
+ %endif
+ %endif
+ %if xmm_regs_used > 6
+ WIN64_PUSH_XMM
+ %endif
+ %endif
+ %endif
+%endmacro
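+
+; Illustrative expansion (a sketch, not emitted verbatim): with
+; HAVE_ALIGNED_STACK == 0, mmsize == 16, xmm_regs_used <= 6 and a positive
+; request such as "ALLOC_STACK 32", the macro produces roughly
+;   mov  rstk, rsp      ; rstk = the extra GPR reserved by SETUP_STACK_POINTER
+;   sub  rsp, 32
+;   and  rsp, ~15       ; re-align to 16 bytes
+; and the matching RET restores the stack with "mov rsp, rstkm" (== rstk here).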
+
+%macro SETUP_STACK_POINTER 1
+ %ifnum %1
+ %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
+ %if %1 > 0
+ %assign regs_used (regs_used + 1)
+ %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
+ %warning "Stack pointer will overwrite register argument"
+ %endif
+ %endif
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+
+ %ifnum %2
+ DEFINE_ARGS %3
+ %elif %1 == 4
+ DEFINE_ARGS %2
+ %elif %1 > 4
+ DEFINE_ARGS %2, %3
+ %endif
+%endmacro
+
%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
- %if mmsize == 8
- %assign xmm_regs_used 0
- %else
+ ALLOC_STACK %4, %3
+ %if mmsize != 8 && stack_size == 0
WIN64_SPILL_XMM %3
%endif
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
- DEFINE_ARGS %4
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
+ %endrep
%endmacro
%macro WIN64_SPILL_XMM 1
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
SUB rsp, (xmm_regs_used-6)*16+16
- %assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
- %assign %%i %%i-1
- movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
- %endrep
+ WIN64_PUSH_XMM
%endif
%endmacro
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
%endrep
- add %1, (xmm_regs_used-6)*16+16
+ %if stack_size_padded == 0
+ add %1, (xmm_regs_used-6)*16+16
+ %endif
+ %endif
+ %if stack_size_padded > 0
+ %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
+ mov rsp, rstkm
+ %else
+ add %1, stack_size_padded
+ %endif
%endif
%endmacro
%macro WIN64_RESTORE_XMM 1
WIN64_RESTORE_XMM_INTERNAL %1
- %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+ %assign stack_offset (stack_offset-stack_size_padded)
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
- DEFINE_ARGS %4
+ DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%macro RET 0
+%if stack_size_padded > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+ mov rsp, rstkm
+%else
+ add rsp, stack_size_padded
+%endif
+%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
vzeroupper
%macro DECLARE_ARG 1-*
%rep %0
- %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1m [rstk + stack_offset + 4*%1 + 4]
%define r%1mp dword r%1m
%rotate 1
%endrep
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
+ ASSERT regs_used >= num_args
+ %if num_args > 7
+ %assign num_args 7
+ %endif
%if regs_used > 7
%assign regs_used 7
%endif
- ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 7
PUSH_IF_USED 3, 4, 5, 6
+ ALLOC_STACK %4
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
- DEFINE_ARGS %4
+ DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%macro RET 0
+%if stack_size_padded > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+ mov rsp, rstkm
+%else
+ add rsp, stack_size_padded
+%endif
+%endif
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
vzeroupper
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
+%macro WIN64_PUSH_XMM 0
+%endmacro
%endif
%macro REP_RET 0
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
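+; For instance (hypothetical name), after "INIT_XMM sse2",
+;   cglobal foo, 2,2
+; emits the function as foo_sse2 (with whatever C-linkage mangling applies),
+; and subsequent uses of foo_sse2 in the file refer to that mangled symbol.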
-%macro cglobal 1-2+ ; name, [PROLOGUE args]
-%if %0 == 1
- cglobal_internal %1 %+ SUFFIX
-%else
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+ ; the "" is a workaround for nasm, which fails if SUFFIX is empty
+ ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2)
cglobal_internal %1 %+ SUFFIX, %2
-%endif
%endmacro
%macro cglobal_internal 1-2+
%ifndef cglobaled_%1
align function_align
%1:
RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %xdefine rstk rsp
%assign stack_offset 0
- %if %0 > 1
+ %assign stack_size 0
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
+ %ifnidn %2, ""
PROLOGUE %2
%endif
%endmacro