From: Loren Merritt Date: Sun, 2 Mar 2008 03:04:07 +0000 (+0000) Subject: pic macros now keep track of which register holds the GOT, so variable access doesn... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9dce08ac53aa22695d1934bc321122863ac3739e;p=libx264 pic macros now keep track of which register holds the GOT, so variable access doesn't have to care git-svn-id: svn://svn.videolan.org/x264/trunk@745 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm index 53532756..cfa64f31 100644 --- a/common/i386/dct-a.asm +++ b/common/i386/dct-a.asm @@ -151,7 +151,7 @@ cglobal x264_dct4x4dc_mmx MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23 MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 - movq mm6, [pw_1 GOT_edx] + movq mm6, [pw_1 GLOBAL] paddw mm0, mm6 paddw mm2, mm6 psraw mm0, 1 @@ -255,7 +255,7 @@ cglobal x264_add4x4_idct_mmx MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 MMX_ZERO mm7 - movq mm6, [pw_32 GOT_edx] + movq mm6, [pw_32 GLOBAL] MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax+0*FDEC_STRIDE] MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+1*FDEC_STRIDE] @@ -693,7 +693,7 @@ cglobal x264_add8x8_idct8_sse2 IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax picgetgot edx - paddw xmm4, [pw_32 GOT_edx] + paddw xmm4, [pw_32 GLOBAL] movdqa [eax+0x00], xmm4 movdqa [eax+0x40], xmm2 IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1 diff --git a/common/i386/deblock-a.asm b/common/i386/deblock-a.asm index 518e61cb..f4c52c43 100644 --- a/common/i386/deblock-a.asm +++ b/common/i386/deblock-a.asm @@ -183,18 +183,18 @@ SECTION .text %macro DEBLOCK_P0_Q0_MMX 0 movq mm5, mm1 pxor mm5, mm2 ; p0^q0 - pand mm5, [pb_01 GOT_ebx] ; (p0^q0)&1 + pand mm5, [pb_01 GLOBAL] ; (p0^q0)&1 pcmpeqb mm4, mm4 pxor mm3, mm4 pavgb mm3, mm0 ; (p1 - q1 + 256)>>1 - pavgb mm3, [pb_03 GOT_ebx] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 + pavgb mm3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 pxor mm4, mm1 pavgb mm4, mm2 ; (q0 - p0 + 256)>>1 pavgb mm3, mm5 paddusb mm3, mm4 ; d+128+33 - movq mm6, [pb_a1 GOT_ebx] + movq mm6, [pb_a1 GLOBAL] psubusb mm6, mm3 - psubusb mm3, [pb_a1 GOT_ebx] + psubusb mm3, [pb_a1 GLOBAL] pminub mm6, mm7 pminub mm3, mm7 psubusb mm1, mm6 @@ -212,7 +212,7 @@ SECTION .text pavgb %6, mm2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 - pand %6, [pb_01 GOT_ebx] ; (p2^avg(p0,q0))&1 + pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 movq %6, %1 psubusb %6, %5 @@ -280,7 +280,7 @@ cglobal x264_deblock_v8_luma_mmxext movq mm3, [edi+esi] LUMA_Q1_MMX mm3, mm4, [edi+2*esi], [edi+esi], mm5, mm6 - DEBLOCK_P0_Q0_MMX ; XXX: make sure ebx has the GOT in PIC mode + DEBLOCK_P0_Q0_MMX movq [eax+2*esi], mm1 movq [edi], mm2 @@ -406,7 +406,7 @@ cglobal x264_deblock_v_chroma_mmxext punpcklbw mm6, mm6 pand mm7, mm6 picgetgot ebx ; no need to push ebx, it's already been done - DEBLOCK_P0_Q0_MMX ; XXX: make sure ebx has the GOT in PIC mode + DEBLOCK_P0_Q0_MMX movq [eax+esi], mm1 movq [edi], mm2 @@ -433,7 +433,7 @@ cglobal x264_deblock_h_chroma_mmxext punpcklbw mm6, mm6 pand mm7, mm6 picgetgot ebx ; no need to push ebx, it's already been done - DEBLOCK_P0_Q0_MMX ; XXX: make sure ebx has the GOT in PIC mode + DEBLOCK_P0_Q0_MMX movq mm0, [esp+8] movq mm3, [esp+0] @@ -450,7 +450,7 @@ cglobal x264_deblock_h_chroma_mmxext %macro CHROMA_INTRA_P0 3 movq mm4, %1 pxor mm4, %3 - pand mm4, [pb_01 GOT_ebx] ; mm4 = (p0^q1)&1 + pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1 pavgb %1, %3 psubusb %1, mm4 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) @@ -481,7 +481,7 @@ cglobal x264_deblock_v_chroma_intra_mmxext movq mm1, [eax+esi] movq mm2, [edi] movq mm3, [edi+esi] - CHROMA_INTRA_BODY ; XXX: make sure ebx has the GOT in PIC mode + CHROMA_INTRA_BODY movq [eax+esi], mm1 movq [edi], mm2 picpop ebx @@ -495,7 +495,7 @@ cglobal x264_deblock_h_chroma_intra_mmxext picpush ebx picgetgot ebx TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp) - CHROMA_INTRA_BODY ; XXX: make sure ebx has the GOT in PIC mode + CHROMA_INTRA_BODY TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp) picpop ebx pop ebp ; needed because of CHROMA_H_START diff --git a/common/i386/i386inc.asm b/common/i386/i386inc.asm index e56f5940..c5a97b46 100644 --- a/common/i386/i386inc.asm +++ b/common/i386/i386inc.asm @@ -66,14 +66,16 @@ BITS 32 ; and let you load non-shared .so objects (Linux, Win32...). However, OS X ; requires PIC code in its .dylib objects. ; -; - GOT_* should be used as a suffix for global addressing, eg. +; - GLOBAL should be used as a suffix for global addressing, eg. ; picgetgot ebx -; mov eax, [foo GOT_ebx] +; mov eax, [foo GLOBAL] ; instead of ; mov eax, [foo] ; ; - picgetgot computes the GOT address into the given register in PIC -; mode, otherwise does nothing. You need to do this before using GOT_*. +; mode, otherwise does nothing. You need to do this before using GLOBAL. +; Before in both execution order and compiled code order (so GLOBAL knows +; which register the GOT is in). ; ; - picpush and picpop respectively push and pop the given register ; in PIC mode, otherwise do nothing. You should always use them around @@ -94,15 +96,14 @@ BITS 32 %ifidn __OUTPUT_FORMAT__,macho ; There is no real global offset table on OS X, but we still ; need to reference our variables by offset. - %define GOT_eax - fakegot + eax - %define GOT_ebx - fakegot + ebx - %define GOT_ecx - fakegot + ecx - %define GOT_edx - fakegot + edx + %define GOT_reg(x) - fakegot + x %macro picgetgot 1 call %%getgot %%getgot: pop %1 add %1, $$ - %%getgot + %undef GLOBAL + %define GLOBAL GOT_reg(%1) %endmacro %else %ifidn __OUTPUT_FORMAT__,elf @@ -111,15 +112,14 @@ BITS 32 %define GOT __GLOBAL_OFFSET_TABLE_ %endif extern GOT - %define GOT_eax + eax wrt ..gotoff - %define GOT_ebx + ebx wrt ..gotoff - %define GOT_ecx + ecx wrt ..gotoff - %define GOT_edx + edx wrt ..gotoff + %define GOT_reg(x) + x wrt ..gotoff %macro picgetgot 1 call %%getgot %%getgot: pop %1 add %1, GOT + $$ - %%getgot wrt ..gotpc + %undef GLOBAL + %define GLOBAL GOT_reg(%1) %endmacro %endif %macro picpush 1 @@ -130,10 +130,7 @@ BITS 32 %endmacro %define picesp esp+4 %else - %define GOT_eax - %define GOT_ebx - %define GOT_ecx - %define GOT_edx + %define GLOBAL %macro picgetgot 1 %endmacro %macro picpush 1 diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm index 2622b643..3a34e2ea 100644 --- a/common/i386/mc-a.asm +++ b/common/i386/mc-a.asm @@ -254,8 +254,8 @@ AVG2_END push edi push esi picgetgot ecx - movq mm5, [pw_64 GOT_ecx] - movq mm6, [pw_32 GOT_ecx] ; rounding + movq mm5, [pw_64 GLOBAL] + movq mm6, [pw_32 GLOBAL] ; rounding mov edi, [esp+12] ; dst mov esi, [esp+16] ; i_dst mov edx, [esp+20] ; src @@ -512,7 +512,7 @@ cglobal x264_mc_chroma_mmxext pshufw mm5, mm5, 0 ; mm5 = dx&7 pshufw mm6, mm6, 0 ; mm6 = dy&7 - movq mm4, [pw_8 GOT_ebx] + movq mm4, [pw_8 GLOBAL] movq mm0, mm4 psubw mm4, mm5 ; mm4 = 8-dx @@ -546,7 +546,7 @@ ALIGN 4 punpcklbw mm2, mm3 punpcklbw mm1, mm3 - paddw mm0, [pw_32 GOT_ebx] + paddw mm0, [pw_32 GLOBAL] pmullw mm2, mm5 ; line * cB pmullw mm1, mm7 ; line * cD diff --git a/common/i386/mc-a2.asm b/common/i386/mc-a2.asm index 211427b2..fb003997 100644 --- a/common/i386/mc-a2.asm +++ b/common/i386/mc-a2.asm @@ -134,9 +134,9 @@ cglobal x264_hpel_filter_mmxext pxor mm0, mm0 ; mov globals onto the stack, to free up ebx - movq mm1, [pw_1 GOT_ebx] - movq mm2, [pw_16 GOT_ebx] - movq mm3, [pw_32 GOT_ebx] + movq mm1, [pw_1 GLOBAL] + movq mm2, [pw_16 GLOBAL] + movq mm3, [pw_32 GLOBAL] movq [tpw_1], mm1 movq [tpw_16], mm2 movq [tpw_32], mm3 diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm index 9ab23c61..bb2e9c57 100644 --- a/common/i386/pixel-sse2.asm +++ b/common/i386/pixel-sse2.asm @@ -43,7 +43,7 @@ SECTION .text %macro HADDW 2 ; sum junk ; ebx is no longer used at this point, so no push needed picgetgot ebx - pmaddwd %1, [pw_1 GOT_ebx] + pmaddwd %1, [pw_1 GLOBAL] movhlps %2, %1 paddd %1, %2 pshuflw %2, %1, 0xE @@ -413,7 +413,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1 shl eax, 4 %endif picgetgot ebx - lea edi, [sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1) + eax GOT_ebx] + lea edi, [sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1) + eax GLOBAL] mov eax, [esp+16] mov ebx, [esp+20] mov ecx, [esp+24] @@ -966,7 +966,7 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2 ; PHADDD xmm3, xmm4 mov eax, [esp+24] picgetgot ebx - movdqa xmm7, [pw_1 GOT_ebx] + movdqa xmm7, [pw_1 GLOBAL] pshufd xmm5, xmm3, 0xB1 pmaddwd xmm1, xmm7 pmaddwd xmm2, xmm7 @@ -1010,8 +1010,8 @@ cglobal x264_pixel_ssim_end4_sse2 paddd xmm1, xmm2 paddd xmm2, xmm3 paddd xmm3, xmm4 - movdqa xmm5, [ssim_c1 GOT_ebx] - movdqa xmm6, [ssim_c2 GOT_ebx] + movdqa xmm5, [ssim_c1 GLOBAL] + movdqa xmm6, [ssim_c2 GLOBAL] TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4 ; s1=mm0, s2=mm3, ss=mm4, s12=mm2 @@ -1038,7 +1038,7 @@ cglobal x264_pixel_ssim_end4_sse2 divps xmm1, xmm0 ; ssim neg edx - movdqu xmm3, [mask_ff + edx*4 + 16 GOT_ebx] + movdqu xmm3, [mask_ff + edx*4 + 16 GLOBAL] pand xmm1, xmm3 movhlps xmm0, xmm1 addps xmm0, xmm1 diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm index 6dc71114..4c58357d 100644 --- a/common/i386/predict-a.asm +++ b/common/i386/predict-a.asm @@ -97,7 +97,7 @@ SECTION .text pavgb %2, %3 pxor %3, %5 mov%6 %1, %4 - pand %3, [pb_1 GOT_ecx] + pand %3, [pb_1 GLOBAL] psubusb %2, %3 pavgb %1, %2 %endmacro @@ -118,7 +118,7 @@ cglobal predict_4x4_ddl_mmxext movq mm3, [eax - FDEC_STRIDE ] movq mm1, [eax - FDEC_STRIDE - 1] movq mm2, mm3 - movq mm4, [pb_0s_ff GOT_ecx] + movq mm4, [pb_0s_ff GLOBAL] psrlq mm2, 8 pand mm4, mm3 por mm2, mm4 @@ -176,7 +176,7 @@ cglobal predict_8x8_dc_mmxext pxor mm1, mm1 psadbw mm0, [eax+7] psadbw mm1, [eax+16] - paddw mm0, [pw_8 GOT_ecx] + paddw mm0, [pw_8 GLOBAL] paddw mm0, mm1 psrlw mm0, 4 pshufw mm0, mm0, 0 @@ -194,7 +194,7 @@ cglobal %1 mov edx, [esp + 4] pxor mm0, mm0 psadbw mm0, [eax+%2] - paddw mm0, [pw_4 GOT_ecx] + paddw mm0, [pw_4 GLOBAL] psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 @@ -338,7 +338,7 @@ cglobal predict_8x8c_dc_core_mmxext paddw mm0, [esp + 8] pshufw mm2, [esp + 12], 0 psrlw mm0, 3 - paddw mm1, [pw_2 GOT_ecx] + paddw mm1, [pw_2 GLOBAL] movq mm3, mm2 pshufw mm1, mm1, 0 pshufw mm0, mm0, 0 ; dc0 (w) @@ -364,7 +364,7 @@ cglobal predict_8x8c_p_core_mmxext pshufw mm2, [esp +12], 0 pshufw mm4, [esp +16], 0 movq mm1, mm2 - pmullw mm2, [pw_3210 GOT_ecx] + pmullw mm2, [pw_3210 GLOBAL] psllw mm1, 2 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} @@ -400,7 +400,7 @@ cglobal predict_16x16_p_core_mmxext pshufw mm4, [esp +16], 0 movq mm5, mm2 movq mm1, mm2 - pmullw mm5, [pw_3210 GOT_ecx] + pmullw mm5, [pw_3210 GLOBAL] psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 @@ -454,7 +454,7 @@ cglobal predict_16x16_p_core_sse2 punpcklqdq xmm1, xmm1 punpcklqdq xmm2, xmm2 movdqa xmm3, xmm1 - pmullw xmm3, [pw_76543210 GOT_ecx] + pmullw xmm3, [pw_76543210 GLOBAL] psllw xmm1, 3 paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} @@ -521,7 +521,7 @@ cglobal predict_16x16_dc_core_mmxext cglobal predict_16x16_dc_top_mmxext picgetgot ecx - PRED16x16_DC [pw_8 GOT_ecx], 4 + PRED16x16_DC [pw_8 GLOBAL], 4 ret ;----------------------------------------------------------------------------- @@ -549,7 +549,7 @@ cglobal predict_16x16_dc_core_sse2 cglobal predict_16x16_dc_top_sse2 picgetgot ecx - PRED16x16_DC_SSE2 [pw_8 GOT_ecx], 4 + PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4 ret ;----------------------------------------------------------------------------- diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm index dc7611ae..0c59361f 100644 --- a/common/i386/quant-a.asm +++ b/common/i386/quant-a.asm @@ -277,7 +277,7 @@ cglobal %1 neg eax movd mm5, eax picgetgot eax - movq mm6, [pd_1 GOT_eax] + movq mm6, [pd_1 GLOBAL] pxor mm7, mm7 pslld mm6, mm5 psrld mm6, 1