Store XMM6 and XMM7 in the shadow space in functions that clobber them.
This way we don't have to adjust the stack pointer as often,
reducing the number of instructions as well as code size.
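
For illustration, a minimal sketch of the idea (not part of the patch; the function name is hypothetical): on Win64 the caller has already reserved a 32-byte shadow space above the return address, and since rsp+8 is 16-byte aligned at function entry, two XMM registers fit there with movaps and no stack-pointer adjustment.

    ; illustrative sketch only, assuming a leaf function that clobbers xmm6/xmm7
    my_func:                     ; Win64: [rsp+8]..[rsp+39] is the caller's shadow space
        movaps [rsp +  8], xmm6  ; rsp+8 is 16-byte aligned at entry, so movaps is safe
        movaps [rsp + 24], xmm7
        ; ... function body clobbering xmm6/xmm7 ...
        movaps xmm7, [rsp + 24]  ; restore before returning
        movaps xmm6, [rsp +  8]
        ret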
; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma, 5,9
+cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64
lea r8, [r1*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
%if WIN64
- sub rsp, 0x98
- %define pix_tmp rsp+0x30
+ %define pix_tmp rsp+0x30 ; shadow space + r4
%else
- sub rsp, 0x68
%define pix_tmp rsp
%endif
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-%if WIN64
- add rsp, 0x98
-%else
- add rsp, 0x68
-%endif
RET
%endmacro
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,9
+cglobal deblock_h_luma_intra, 4,9,0,0x80
lea r8, [r1*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
- sub rsp, 0x88
+%if WIN64
+ %define pix_tmp rsp+0x20 ; shadow space
+%else
%define pix_tmp rsp
+%endif
; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
sub r5, r7
shr r7, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
- add rsp, 0x88
RET
%else
cglobal deblock_h_luma_intra, 2,4,8,0x80
%if ARCH_X86_64 ; too many regs for x86_32
RESET_MM_PERMUTATION
%if WIN64
-%if xmm_regs_used > 6
- %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
- %assign xmm_regs_used 6
-%endif
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
%endif
.mc1dy:
and t2d, 7
mov r6, rsp
and rsp, ~31
- SUB rsp, 0x240
+ sub rsp, 0x240
movu m5, [r0+0*FENC_STRIDE]
movu m6, [r0+4*FENC_STRIDE]
punpcklqdq m5, [r0+2*FENC_STRIDE]
.split:
%if ARCH_X86_64
PROLOGUE 6,9
+ push r3
+ push r2
%if WIN64
movsxd r4, r4d
- sub rsp, 8
+ sub rsp, 40 ; shadow space and alignment
%endif
- push r3
- push r2
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r4
call pixel_sad_%1x%2_cache%3_%5
mov [r8], eax
%if WIN64
- mov r2, [rsp]
+ mov r2, [rsp+40+0*8]
%else
pop r2
%endif
call pixel_sad_%1x%2_cache%3_%5
mov [r8+4], eax
%if WIN64
- mov r2, [rsp+8]
+ mov r2, [rsp+40+1*8]
%else
pop r2
%endif
call pixel_sad_%1x%2_cache%3_%5
mov [r8+8], eax
%if WIN64
- add rsp, 24
+ add rsp, 40+2*8
%endif
RET
%else
push r4
push r3
push r2
+%if WIN64
+ sub rsp, 32 ; shadow space
+%endif
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r5
call pixel_sad_%1x%2_cache%3_%5
mov [r8], eax
%if WIN64
- mov r2, [rsp]
+ mov r2, [rsp+32+0*8]
%else
pop r2
%endif
call pixel_sad_%1x%2_cache%3_%5
mov [r8+4], eax
%if WIN64
- mov r2, [rsp+8]
+ mov r2, [rsp+32+1*8]
%else
pop r2
%endif
call pixel_sad_%1x%2_cache%3_%5
mov [r8+8], eax
%if WIN64
- mov r2, [rsp+16]
+ mov r2, [rsp+32+2*8]
%else
pop r2
%endif
call pixel_sad_%1x%2_cache%3_%5
mov [r8+12], eax
%if WIN64
- add rsp, 24
+ add rsp, 32+3*8
%endif
RET
%else
%if stack_size < 0
%assign stack_size -stack_size
%endif
- %if mmsize != 8
- %assign xmm_regs_used %2
+ %assign stack_size_padded stack_size
+ %if WIN64
+ %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %if xmm_regs_used > 8
+ %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+ %endif
+ %endif
%endif
%if mmsize <= 16 && HAVE_ALIGNED_STACK
- %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
- %if xmm_regs_used > 6
- %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
- %endif
+ %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
SUB rsp, stack_size_padded
%else
%assign %%reg_num (regs_used - 1)
; stack in a single instruction (i.e. mov rsp, rstk or mov
; rsp, [rsp+stack_size_padded])
mov rstk, rsp
- %assign stack_size_padded stack_size
- %if xmm_regs_used > 6
- %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
- %if mmsize == 32 && xmm_regs_used & 1
- ; re-align to 32 bytes
- %assign stack_size_padded (stack_size_padded + 16)
- %endif
- %endif
%if %1 < 0 ; need to store rsp on stack
sub rsp, gprsize+stack_size_padded
and rsp, ~(%%stack_alignment-1)
%xdefine rstkm rstk
%endif
%endif
- %if xmm_regs_used > 6
- WIN64_PUSH_XMM
- %endif
+ WIN64_PUSH_XMM
%endif
%endif
%endmacro
%endmacro
%macro WIN64_PUSH_XMM 0
- %assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
- %assign %%i %%i-1
- movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
- %endrep
+ ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+ %if xmm_regs_used > 6
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if xmm_regs_used > 7
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %if xmm_regs_used > 8
+ %assign %%i 8
+ %rep xmm_regs_used-8
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
%endmacro
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 6
- SUB rsp, (xmm_regs_used-6)*16+16
- WIN64_PUSH_XMM
+ %if xmm_regs_used > 8
+ %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+ SUB rsp, stack_size_padded
%endif
+ WIN64_PUSH_XMM
%endmacro
%macro WIN64_RESTORE_XMM_INTERNAL 1
- %if xmm_regs_used > 6
+ %assign %%pad_size 0
+ %if xmm_regs_used > 8
%assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
+ %rep xmm_regs_used-8
%assign %%i %%i-1
- movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+ movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
%endrep
- %if stack_size_padded == 0
- add %1, (xmm_regs_used-6)*16+16
- %endif
%endif
%if stack_size_padded > 0
%if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
mov rsp, rstkm
%else
add %1, stack_size_padded
+ %assign %%pad_size stack_size_padded
%endif
%endif
+ %if xmm_regs_used > 7
+ movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+ %endif
+ %if xmm_regs_used > 6
+ movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+ %endif
%endmacro
%macro WIN64_RESTORE_XMM 1
%endif
align function_align
%2:
- RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
- %xdefine rstk rsp
- %assign stack_offset 0
- %assign stack_size 0
- %assign stack_size_padded 0
- %assign xmm_regs_used 0
+ RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+ %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+ %assign stack_offset 0 ; stack pointer offset relative to the return address
+ %assign stack_size 0 ; amount of stack space that can be freely used inside a function
+ %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
%ifnidn %3, ""
PROLOGUE %3
%endif
;* Copyright (C) 2008-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;* Henrik Gramner <hengar-6@student.ltu.se>
+;* Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal checkasm_call, 2,15,16
- SUB rsp, max_args*8+16
+cglobal checkasm_call, 2,15,16,max_args*8+8
mov r6, r0
mov [rsp+max_args*8], r1
mov dword [r1], 0
mov rax, r9
.ok:
- ADD rsp, max_args*8+16
RET
%else