From dc454eab263d463d2eeecf627aae31a10a5d080c Mon Sep 17 00:00:00 2001 From: Sam Hocevar Date: Wed, 8 Feb 2006 09:26:56 +0000 Subject: [PATCH] * Additional fixes to the PIC versions of assembly routines. They now pass all checkasm tests and output streams are bit-by-bit identical, which sounds good. git-svn-id: svn://svn.videolan.org/x264/trunk@422 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/amd64inc.asm | 8 ++++ common/i386/dct-a.asm | 33 ++++++++------- common/i386/deblock-a.asm | 32 +++++++-------- common/i386/i386inc.asm | 55 ++++++++++++++++++++----- common/i386/mc-a.asm | 50 +++++++++++------------ common/i386/mc-a2.asm | 65 +++++++++++++++-------------- common/i386/pixel-sse2.asm | 2 +- common/i386/predict-a.asm | 84 +++++++++++++++++++------------------- common/i386/quant-a.asm | 12 +++--- 9 files changed, 192 insertions(+), 149 deletions(-) diff --git a/common/amd64/amd64inc.asm b/common/amd64/amd64inc.asm index 21d0e436..ab99dea8 100644 --- a/common/amd64/amd64inc.asm +++ b/common/amd64/amd64inc.asm @@ -258,6 +258,14 @@ SECTION .text %endif ;linux +; PIC support macros. On x86_64 we just use RIP-relative addressing, which is +; much simpler than the GOT handling we need to perform on x86. +; +; - GLOBAL should be used as a suffix for global addressing, eg. 
+; mov eax, [foo GLOBAL] +; instead of +; mov eax, [foo] +; %ifdef __PIC__ %define GLOBAL wrt rip %else diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm index c3c20722..3b246c8e 100644 --- a/common/i386/dct-a.asm +++ b/common/i386/dct-a.asm @@ -154,14 +154,15 @@ ALIGN 16 ; void __cdecl dct4x4dc( int16_t d[4][4] ) ;----------------------------------------------------------------------------- x264_dct4x4dc_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC mov eax, [esp+ 4] movq mm0, [eax+ 0] movq mm1, [eax+ 8] movq mm2, [eax+16] movq mm3, [eax+24] + picpush ebx + picgetgot ebx + MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23 MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 @@ -185,7 +186,7 @@ x264_dct4x4dc_mmxext: movq [eax+16], mm1 psraw mm3, 1 movq [eax+24], mm3 - POP_EBX_IF_PIC + picpop ebx ret cglobal x264_idct4x4dc_mmxext @@ -272,9 +273,6 @@ ALIGN 16 ; void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) ;----------------------------------------------------------------------------- x264_add4x4_idct_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC - ; Load dct coeffs mov eax, [esp+12] ; dct movq mm0, [eax+ 0] @@ -286,6 +284,9 @@ x264_add4x4_idct_mmxext: mov ecx, [esp+ 8] ; i_dst lea edx, [ecx+ecx*2] + picpush ebx + picgetgot ebx + ; out:mm0, mm1, mm2, mm3 MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2 @@ -310,7 +311,7 @@ x264_add4x4_idct_mmxext: MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+ecx*2] MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+edx] - POP_EBX_IF_PIC + picpop ebx ret @@ -395,10 +396,11 @@ ALIGN 16 ; void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] ); ;----------------------------------------------------------------------------- x264_xdct8_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC - mov eax, [esp+04] ; dest + + picpush ebx + picgetgot ebx + movq mm5, [x264_mmx_PPNN GLOBAL] movq mm6, [x264_mmx_PNNP GLOBAL] movq mm4, [x264_mmx_PPPN GLOBAL] @@ -458,7 +460,7 
@@ x264_xdct8_mmxext: %assign disp disp+16 %endrep - POP_EBX_IF_PIC + picpop ebx ret ALIGN 16 @@ -551,10 +553,11 @@ ALIGN 16 ; void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] ); ;----------------------------------------------------------------------------- x264_xidct8_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC - mov eax, [esp+04] ; dest + + picpush ebx + picgetgot ebx + movq mm4, [x264_mmx_PPNN GLOBAL] movq mm5, [x264_mmx_PNPN GLOBAL] movq mm6, [x264_mmx_PPNP GLOBAL] @@ -609,7 +612,7 @@ x264_xidct8_mmxext: %assign disp disp+16 %endrep - POP_EBX_IF_PIC + picpop ebx ret ALIGN 16 diff --git a/common/i386/deblock-a.asm b/common/i386/deblock-a.asm index e05de4c7..0085af0b 100644 --- a/common/i386/deblock-a.asm +++ b/common/i386/deblock-a.asm @@ -247,14 +247,14 @@ ALIGN 16 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- x264_deblock_v8_luma_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx push edi push esi - mov edi, [esp+12] ; pix - mov esi, [esp+16] ; stride - mov edx, [esp+20] ; alpha - mov ecx, [esp+24] ; beta + mov edi, [picesp+12] ; pix + mov esi, [picesp+16] ; stride + mov edx, [picesp+20] ; alpha + mov ecx, [picesp+24] ; beta dec edx dec ecx mov eax, edi @@ -269,7 +269,7 @@ x264_deblock_v8_luma_mmxext: movq mm3, [edi+esi] ; q1 LOAD_MASK_MMX edx, ecx - mov ecx, [esp+44] ; tc0, use only the low 16 bits + mov ecx, [picesp+44] ; tc0, use only the low 16 bits movd mm4, [ecx] punpcklbw mm4, mm4 punpcklbw mm4, mm4 ; tc = 4x tc0[1], 4x tc0[0] @@ -310,7 +310,7 @@ x264_deblock_v8_luma_mmxext: add esp, 16 pop esi pop edi - POP_EBX_IF_PIC + picpop ebx ret @@ -430,7 +430,7 @@ x264_deblock_v_chroma_mmxext: movd mm6, [ebx] punpcklbw mm6, mm6 pand mm7, mm6 - GET_GOT_IN_EBX_IF_PIC ; no need to push ebx, it's already been done + picgetgot ebx ; no need to push ebx, it's already been done DEBLOCK_P0_Q0_MMX ; XXX: 
make sure ebx has the GOT in PIC mode movq [eax+esi], mm1 @@ -458,6 +458,7 @@ x264_deblock_h_chroma_mmxext: movd mm6, [ebx] punpcklbw mm6, mm6 pand mm7, mm6 + picgetgot ebx ; no need to push ebx, it's already been done DEBLOCK_P0_Q0_MMX ; XXX: make sure ebx has the GOT in PIC mode movq mm0, [esp+8] @@ -501,8 +502,8 @@ ALIGN 16 ;----------------------------------------------------------------------------- x264_deblock_v_chroma_intra_mmxext: CHROMA_V_START - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx movq mm0, [eax] movq mm1, [eax+esi] movq mm2, [edi] @@ -510,7 +511,7 @@ x264_deblock_v_chroma_intra_mmxext: CHROMA_INTRA_BODY ; XXX: make sure ebx has the GOT in PIC mode movq [eax+esi], mm1 movq [edi], mm2 - POP_EBX_IF_PIC + picpop ebx CHROMA_END ALIGN 16 @@ -519,13 +520,12 @@ ALIGN 16 ;----------------------------------------------------------------------------- x264_deblock_h_chroma_intra_mmxext: CHROMA_H_START - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp) CHROMA_INTRA_BODY ; XXX: make sure ebx has the GOT in PIC mode TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp) - POP_EBX_IF_PIC + picpop ebx pop ebp ; needed because of CHROMA_H_START - POP_EBX_IF_PIC CHROMA_END diff --git a/common/i386/i386inc.asm b/common/i386/i386inc.asm index 0495148c..cf9e845c 100644 --- a/common/i386/i386inc.asm +++ b/common/i386/i386inc.asm @@ -35,28 +35,61 @@ BITS 32 %endif %endmacro +; PIC support macros. All these macros are totally harmless when __PIC__ is +; not defined but can ruin everything if misused in PIC mode. On x86, shared +; objects cannot directly access global variables by address, they need to +; go through the GOT (global offset table). Most OSes do not care about it +; and let you load non-shared .so objects (Linux, Win32...). However, OS X +; requires PIC code in its .dylib objects. +; +; - GLOBAL should be used as a suffix for global addressing, eg. 
+; mov eax, [foo GLOBAL] +; instead of +; mov eax, [foo] +; +; - picgetgot computes the GOT address into the given register in PIC +; mode, otherwise does nothing. You need to do this before using GLOBAL. +; +; - picpush and picpop respectively push and pop the given register +; in PIC mode, otherwise do nothing. You should always use them around +; picgetgot except when sure that the register is no longer used and is +; being restored later by other means. +; +; - picesp is defined to compensate the changing of esp when pushing +; a register into the stack, eg. +; mov eax, [esp + 8] +; picpush ebx +; mov eax, [picesp + 12] +; instead of +; mov eax, [esp + 8] +; picpush ebx +; mov eax, [esp + 12] +; %ifdef __PIC__ extern _GLOBAL_OFFSET_TABLE_ - %define GLOBAL wrt ..gotpc - %macro GET_GOT_IN_EBX_IF_PIC 0 + ; FIXME: find an elegant way to use registers other than ebx + %define GLOBAL + ebx wrt ..gotoff + %macro picgetgot 1 call %%getgot %%getgot: - pop ebx - add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc + pop %1 + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc %endmacro - %macro PUSH_EBX_IF_PIC 0 - push ebx + %macro picpush 1 + push %1 %endmacro - %macro POP_EBX_IF_PIC 0 - pop ebx + %macro picpop 1 + pop %1 %endmacro + %define picesp esp+4 %else %define GLOBAL - %macro GET_GOT_IN_EBX_IF_PIC 0 + %macro picgetgot 1 %endmacro - %macro PUSH_EBX_IF_PIC 0 + %macro picpush 1 %endmacro - %macro POP_EBX_IF_PIC 0 + %macro picpop 1 %endmacro + %define picesp esp %endif diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm index 949b68c6..96b3ac2a 100644 --- a/common/i386/mc-a.asm +++ b/common/i386/mc-a.asm @@ -265,21 +265,21 @@ ALIGN 4 %macro BIWEIGHT_START_MMX 0 push edi push esi - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC - mov edi, [esp+12] ; dst - mov esi, [esp+16] ; i_dst - mov edx, [esp+20] ; src - mov ecx, [esp+24] ; i_src - - pshufw mm4, [esp+28], 0 ; weight_dst + picpush ebx + picgetgot ebx + mov edi, [picesp+12] ; dst + mov esi, [picesp+16] ; 
i_dst + mov edx, [picesp+20] ; src + mov ecx, [picesp+24] ; i_src + + pshufw mm4, [picesp+28], 0 ; weight_dst movq mm5, [pw_64 GLOBAL] psubw mm5, mm4 ; weight_src movq mm6, [pw_32 GLOBAL] ; rounding pxor mm7, mm7 %endmacro %macro BIWEIGHT_END_MMX 0 - POP_EBX_IF_PIC + picpop ebx pop esi pop edi ret @@ -291,7 +291,7 @@ ALIGN 16 ;----------------------------------------------------------------------------- x264_pixel_avg_weight_w16_mmxext: BIWEIGHT_START_MMX - mov eax, [esp+32] ; i_height + mov eax, [picesp+32] ; i_height ALIGN 4 .height_loop @@ -312,7 +312,7 @@ ALIGN 16 ;----------------------------------------------------------------------------- x264_pixel_avg_weight_w8_mmxext: BIWEIGHT_START_MMX - mov eax, [esp+32] + mov eax, [picesp+32] ALIGN 4 .height_loop @@ -512,13 +512,13 @@ ALIGN 16 x264_mc_chroma_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx pxor mm3, mm3 - pshufw mm5, [esp+20], 0 ; mm5 = dx - pshufw mm6, [esp+24], 0 ; mm6 = dy + pshufw mm5, [picesp+20], 0 ; mm5 = dx + pshufw mm6, [picesp+24], 0 ; mm6 = dy movq mm4, [pw_8 GLOBAL] movq mm0, mm4 @@ -534,10 +534,10 @@ x264_mc_chroma_mmxext: push edi - mov eax, [esp+4+4] ; src - mov edi, [esp+4+12] ; dst - mov ecx, [esp+4+8] ; i_src_stride - mov edx, [esp+4+32] ; i_height + mov eax, [picesp+4+4] ; src + mov edi, [picesp+4+12] ; dst + mov ecx, [picesp+4+8] ; i_src_stride + mov edx, [picesp+4+32] ; i_height ALIGN 4 .height_loop @@ -568,22 +568,22 @@ ALIGN 4 movd [edi], mm0 add eax, ecx - add edi, [esp+4+16] + add edi, [picesp+4+16] dec edx jnz .height_loop - sub [esp+4+28], dword 8 + sub [picesp+4+28], dword 8 jnz .finish ; width != 8 so assume 4 - mov edi, [esp+4+12] ; dst - mov eax, [esp+4+4] ; src - mov edx, [esp+4+32] ; i_height + mov edi, [picesp+4+12] ; dst + mov eax, [picesp+4+4] ; src + mov edx, [picesp+4+32] ; i_height add edi, 4 add eax, 4 jmp .height_loop .finish pop edi - POP_EBX_IF_PIC + picpop ebx ret diff --git a/common/i386/mc-a2.asm b/common/i386/mc-a2.asm index 
085c48d3..55f0d2d6 100644 --- a/common/i386/mc-a2.asm +++ b/common/i386/mc-a2.asm @@ -169,8 +169,8 @@ x264_center_filter_mmxext : lea ebx, [ecx + ecx * 2] ; 3 * src_stride lea edx, [ecx + ecx * 4] ; 5 * src_stride - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx pxor mm0, mm0 ; 0 ---> mm0 movq mm7, [mmx_dd_one GLOBAL] ; for rounding @@ -178,21 +178,23 @@ x264_center_filter_mmxext : loopcy: -; mov eax, [esp + twidth] +; mov eax, [picesp + twidth] xor eax, eax - mov edi, [esp + tdst1] - lea ebp, [esp + tbuffer] - mov esi, [esp + tsrc] + mov edi, [picesp + tdst1] + lea ebp, [picesp + tbuffer] + mov esi, [picesp + tsrc] + + ; Overwrite mm7, the value set above is never used + movd mm7, [mmx_dw_one GLOBAL] + + picpop ebx - POP_EBX_IF_PIC FILT_ALL esi - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC pshufw mm2, mm1, 0 movq [ebp + 8], mm1 movq [ebp], mm2 - paddw mm1, [mmx_dw_one GLOBAL] + paddw mm1, mm7 psraw mm1, 5 packuswb mm1, mm1 @@ -203,13 +205,10 @@ loopcy: loopcx1: - POP_EBX_IF_PIC FILT_ALL esi - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC movq [ebp + 2 * eax], mm1 - paddw mm1, [mmx_dw_one GLOBAL] + paddw mm1, mm7 psraw mm1, 5 packuswb mm1, mm1 movd [edi + eax - 4], mm1 @@ -219,15 +218,12 @@ loopcx1: cmp eax, [esp + twidth] jnz loopcx1 - POP_EBX_IF_PIC FILT_ALL esi - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC pshufw mm2, mm1, 7 movq [ebp + 2 * eax], mm1 movq [ebp + 2 * eax + 8], mm2 - paddw mm1, [mmx_dw_one GLOBAL] + paddw mm1, mm7 psraw mm1, 5 packuswb mm1, mm1 movd [edi + eax - 4], mm1 @@ -242,14 +238,17 @@ loopcx1: mov edi, [esp + tdst2] xor eax, eax + picpush ebx + picgetgot ebx + loopcx2: - movq mm2, [esp + 2 * eax + 2 + 4 + tbuffer] - movq mm3, [esp + 2 * eax + 4 + 4 + tbuffer] - movq mm4, [esp + 2 * eax + 6 + 4 + tbuffer] - movq mm5, [esp + 2 * eax + 8 + 4 + tbuffer] - movq mm1, [esp + 2 * eax + 4 + tbuffer] - movq mm6, [esp + 2 * eax + 10 + 4 + tbuffer] + movq mm2, [picesp + 2 * eax + 2 + 4 + tbuffer] + movq mm3, [picesp + 2 * eax + 4 + 4 + 
tbuffer] + movq mm4, [picesp + 2 * eax + 6 + 4 + tbuffer] + movq mm5, [picesp + 2 * eax + 8 + 4 + tbuffer] + movq mm1, [picesp + 2 * eax + 4 + tbuffer] + movq mm6, [picesp + 2 * eax + 10 + 4 + tbuffer] paddw mm2, mm5 paddw mm3, mm4 paddw mm1, mm6 @@ -287,19 +286,19 @@ loopcx2: movd [edi + eax], mm2 add eax, 4 - cmp eax, [esp + twidth] + cmp eax, [picesp + twidth] jnz loopcx2 - add edi, [esp + tdstp2] - mov [esp + tdst2], edi + add edi, [picesp + tdstp2] + mov [picesp + tdst2], edi - mov ebp, [esp + theight] + mov ebp, [picesp + theight] dec ebp test ebp, ebp - mov [esp + theight], ebp + mov [picesp + theight], ebp jnz loopcy - POP_EBX_IF_PIC + picpop ebx add esp, [esp + toffset] @@ -327,10 +326,10 @@ x264_horizontal_filter_mmxext : mov esi, [esp + 20] ; src pxor mm0, mm0 - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx movq mm7, [mmx_dw_one GLOBAL] - POP_EBX_IF_PIC + picpop ebx mov ecx, [esp + 32] ; height diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm index ca860ebd..2215a96e 100644 --- a/common/i386/pixel-sse2.asm +++ b/common/i386/pixel-sse2.asm @@ -402,7 +402,7 @@ x264_pixel_ssd_16x8_sse2: %macro SUM_MM_SSE2 2 ; sum junk ; ebx is no longer used at this point, so no push needed - GET_GOT_IN_EBX_IF_PIC + picgetgot ebx ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. 
psrlw %1, 1 movdqa %2, %1 diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm index 2cdb63fa..53a16275 100644 --- a/common/i386/predict-a.asm +++ b/common/i386/predict-a.asm @@ -78,9 +78,9 @@ cglobal predict_16x16_dc_top_mmxext %endmacro %macro PRED8x8_LOAD_TOP 0 - mov edx, [esp + 4] - mov ecx, [esp + 8] - mov eax, [esp +12] + mov edx, [picesp + 4] + mov ecx, [picesp + 8] + mov eax, [picesp +12] sub edx, ecx and eax, 12 @@ -92,7 +92,7 @@ cglobal predict_16x16_dc_top_mmxext mov al, [edx] mov ah, [edx] pinsrw mm1, ax, 0 - mov eax, [esp +12] + mov eax, [picesp + 12] .have_topleft: and eax, byte 4 @@ -113,8 +113,8 @@ cglobal predict_16x16_dc_top_mmxext ALIGN 16 predict_8x8_v_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx PRED8x8_LOAD_TOP lea eax, [ecx + 2*ecx] @@ -128,7 +128,7 @@ predict_8x8_v_mmxext: movq [edx + eax], mm0 ; 5 movq [edx + 4*ecx], mm0 ; 6 - POP_EBX_IF_PIC + picpop ebx ret ;----------------------------------------------------------------------------- @@ -139,10 +139,10 @@ predict_8x8_v_mmxext: ALIGN 16 predict_8x8_dc_core_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx - mov eax, [esp + 16] + mov eax, [picesp + 16] movq mm1, [eax-1] movq mm2, [eax+1] PRED8x8_LOWPASS mm4, [eax] @@ -169,7 +169,7 @@ predict_8x8_dc_core_mmxext: movq [edx + eax], mm0 ; 5 movq [edx + 4*ecx], mm0 ; 6 - POP_EBX_IF_PIC + picpop ebx ret ;----------------------------------------------------------------------------- @@ -207,11 +207,11 @@ predict_8x8c_v_mmx : ALIGN 16 predict_8x8c_dc_core_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx - mov edx, [esp + 4] - mov ecx, [esp + 8] + mov edx, [picesp + 4] + mov ecx, [picesp + 8] sub edx, ecx lea eax, [ecx + 2*ecx] @@ -223,8 +223,8 @@ predict_8x8c_dc_core_mmxext: psadbw mm1, mm2 ; s1 psadbw mm0, mm2 ; s0 - paddw mm0, [esp + 12] - pshufw mm2, [esp + 16], 0 + paddw mm0, [picesp + 12] + pshufw mm2, [picesp + 16], 0 psrlw mm0, 3 
paddw mm1, [pw_2 GLOBAL] movq mm3, mm2 @@ -248,7 +248,7 @@ predict_8x8c_dc_core_mmxext: movq [edx + eax], mm2 ; 6 movq [edx + 4*ecx], mm2 ; 7 - POP_EBX_IF_PIC + picpop ebx ret ;----------------------------------------------------------------------------- @@ -259,15 +259,15 @@ predict_8x8c_dc_core_mmxext: ALIGN 16 predict_8x8c_p_core_mmx: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx - mov edx, [esp + 4] - mov ecx, [esp + 8] + mov edx, [picesp + 4] + mov ecx, [picesp + 8] - movd mm0, [esp +12] - movd mm2, [esp +16] - movd mm4, [esp +20] + movd mm0, [picesp +12] + movd mm2, [picesp +16] + movd mm4, [picesp +20] pshufw mm0, mm0, 0 pshufw mm2, mm2, 0 pshufw mm4, mm4, 0 @@ -297,7 +297,7 @@ ALIGN 4 jg .loop nop - POP_EBX_IF_PIC + picpop ebx ret ;----------------------------------------------------------------------------- @@ -309,15 +309,15 @@ ALIGN 4 ALIGN 16 predict_16x16_p_core_mmx: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx - mov edx, [esp + 4] - mov ecx, [esp + 8] + mov edx, [picesp + 4] + mov ecx, [picesp + 8] - movd mm0, [esp +12] - movd mm2, [esp +16] - movd mm4, [esp +20] + movd mm0, [picesp +12] + movd mm2, [picesp +16] + movd mm4, [picesp +20] pshufw mm0, mm0, 0 ; FIXME shuf these directly from memory pshufw mm2, mm2, 0 ; if there is stack alignment? 
pshufw mm4, mm4, 0 @@ -363,7 +363,7 @@ ALIGN 4 jg .loop nop - POP_EBX_IF_PIC + picpop ebx ret ;----------------------------------------------------------------------------- @@ -411,9 +411,9 @@ predict_16x16_v_mmx : ; ;----------------------------------------------------------------------------- -%macro PRED16x16_DC 2 - mov edx, [esp + 4] - mov ecx, [esp + 8] +%macro PRED16x16_DC 3 + mov edx, [%3 + 4] + mov ecx, [%3 + 8] sub edx, ecx ; edx <-- line -1 pxor mm0, mm0 @@ -444,14 +444,14 @@ ALIGN 4 ALIGN 16 predict_16x16_dc_core_mmxext: - PRED16x16_DC [esp+12], 5 + PRED16x16_DC [esp+12], 5, esp ret ALIGN 16 predict_16x16_dc_top_mmxext: - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC - PRED16x16_DC [pw_8 GLOBAL], 4 - POP_EBX_IF_PIC + picpush ebx + picgetgot ebx + PRED16x16_DC [pw_8 GLOBAL], 4, picesp + picpop ebx ret diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm index b24d9c46..d5634d82 100644 --- a/common/i386/quant-a.asm +++ b/common/i386/quant-a.asm @@ -482,10 +482,10 @@ ALIGN 16 .rshift16: neg eax - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx movq mm6, [pw_1 GLOBAL] - POP_EBX_IF_PIC + picpop ebx movd mm5, eax pxor mm7, mm7 psllw mm6, mm5 @@ -504,10 +504,10 @@ ALIGN 16 .rshift32: neg eax - PUSH_EBX_IF_PIC - GET_GOT_IN_EBX_IF_PIC + picpush ebx + picgetgot ebx movq mm6, [pd_1 GLOBAL] - POP_EBX_IF_PIC + picpop ebx movd mm5, eax pxor mm7, mm7 pslld mm6, mm5 -- 2.40.0