MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
- movq mm6, [x264_mmx_1 GLOBAL]
+ movq mm6, [x264_mmx_1 GOT_ebx]
paddw mm0, mm6
paddw mm4, mm6
psraw mm0, 1
MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
MMX_ZERO mm7
- movq mm6, [x264_mmx_32 GLOBAL]
+ movq mm6, [x264_mmx_32 GOT_ebx]
MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax]
MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+ecx]
picpush ebx
picgetgot ebx
- movq mm5, [x264_mmx_PPNN GLOBAL]
- movq mm6, [x264_mmx_PNNP GLOBAL]
- movq mm4, [x264_mmx_PPPN GLOBAL]
- movq mm7, [x264_mmx_PPNP GLOBAL]
+ movq mm5, [x264_mmx_PPNN GOT_ebx]
+ movq mm6, [x264_mmx_PNNP GOT_ebx]
+ movq mm4, [x264_mmx_PPPN GOT_ebx]
+ movq mm7, [x264_mmx_PPNP GOT_ebx]
;-------------------------------------------------------------------------
; horizontal dct ( compute 1 row at a time -> 8 loops )
pshufw mm2, mm0, 11001001b ; (low)a1/a3/a0/a2(high)
pshufw mm0, mm0, 10011100b ; (low)a0/a2/a1/a3(high)
- pmullw mm2, [x264_mmx_2121 GLOBAL]
+ pmullw mm2, [x264_mmx_2121 GOT_ebx]
pmullw mm0, mm5 ; (low)a0/a2/-a1/-a3(high)
psraw mm2, 1 ; (low)a1/a3>>1/a0/a2>>1(high)
paddw mm0, mm2 ; (low)dst0/dst2/dst4/dst6(high)
picpush ebx
picgetgot ebx
- movq mm4, [x264_mmx_PPNN GLOBAL]
- movq mm5, [x264_mmx_PNPN GLOBAL]
- movq mm6, [x264_mmx_PPNP GLOBAL]
- movq mm7, [x264_mmx_PPPN GLOBAL]
+ movq mm4, [x264_mmx_PPNN GOT_ebx]
+ movq mm5, [x264_mmx_PNPN GOT_ebx]
+ movq mm6, [x264_mmx_PPNP GOT_ebx]
+ movq mm7, [x264_mmx_PPPN GOT_ebx]
;-------------------------------------------------------------------------
; horizontal idct ( compute 1 row at a time -> 8 loops )
punpckhwd mm1, mm2 ; (low)d1,d5,d3,d7(high)
pshufw mm2, mm0, 10110001b ; (low)d4,d0,d6,d2(high)
- pmullw mm0, [x264_mmx_p2n2p1p1 GLOBAL]; (low)2*d0,-2*d4,d2,d6(high)
+ pmullw mm0, [x264_mmx_p2n2p1p1 GOT_ebx]; (low)2*d0,-2*d4,d2,d6(high)
pmullw mm2, mm6 ; (low)d4,d0,-d6,d2(high)
psraw mm0, 1 ; (low)d0,-d4,d2>>1,d6>>1(high)
paddw mm0, mm2 ; (low)e0,e2,e4,e6(high)
pxor mm4, mm2
; b = p0^(q1>>2)
psrlw mm3, 2
- pand mm3, [pb_3f GLOBAL]
+ pand mm3, [pb_3f GOT_ebx]
movq mm5, mm1
pxor mm5, mm3
; c = q0^(p1>>2)
psrlw mm0, 2
- pand mm0, [pb_3f GLOBAL]
+ pand mm0, [pb_3f GOT_ebx]
movq mm6, mm2
pxor mm6, mm0
; d = (c^b) & ~(b^a) & 1
pxor mm6, mm5
pxor mm5, mm4
pandn mm5, mm6
- pand mm5, [pb_01 GLOBAL]
+ pand mm5, [pb_01 GOT_ebx]
; delta = (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3
; = (avg(q0, p1>>2) + (d&a))
; - (avg(p0, q1>>2) + (d^(d&a)))
%macro LUMA_Q1_MMX 6
movq %6, mm1
pavgb %6, mm2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
- pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
+ pand %6, [pb_01 GOT_ebx] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
movq %6, %1
psubusb %6, %5
paddusb %5, %1
punpcklbw mm4, mm4
punpcklbw mm4, mm4 ; tc = 4x tc0[1], 4x tc0[0]
movq [esp+8], mm4 ; tc
- pcmpgtb mm4, [pb_ff GLOBAL]
+ pcmpgtb mm4, [pb_ff GOT_ebx]
pand mm4, mm7
movq [esp+0], mm4 ; mask
pcmpeqb mm6, mm4
pand mm6, mm4
pand mm4, [esp+8] ; tc
- movq mm7, [pb_01 GLOBAL]
+ movq mm7, [pb_01 GOT_ebx]
pand mm7, mm6
pand mm6, mm4
paddb mm7, mm4
pand mm6, mm5
movq mm5, [esp+8] ; tc
pand mm5, mm6
- pand mm6, [pb_01 GLOBAL]
+ pand mm6, [pb_01 GOT_ebx]
paddb mm7, mm6
movq mm3, [edi+esi]
LUMA_Q1_MMX mm3, mm4, [edi+2*esi], [edi+esi], mm5, mm6
%macro CHROMA_INTRA_P0 3
movq mm4, %1
pxor mm4, %3
- pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1
+ pand mm4, [pb_01 GOT_ebx] ; mm4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, mm4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
; and let you load non-shared .so objects (Linux, Win32...). However, OS X
; requires PIC code in its .dylib objects.
;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-; mov eax, [foo GLOBAL]
+; - GOT_<reg> (GOT_eax, GOT_ebx, GOT_ecx or GOT_edx) should be used as a
+; suffix for global addressing, naming the register that was given to
+; picgetgot, e.g.
+; picgetgot ebx
+; mov eax, [foo GOT_ebx]
; instead of
; mov eax, [foo]
;
; - picgetgot computes the GOT address into the given register in PIC
-; mode, otherwise does nothing. You need to do this before using GLOBAL.
+; mode, otherwise does nothing. You need to do this before using the
+; GOT_* suffix that names the same register.
;
; - picpush and picpop respectively push and pop the given register
; in PIC mode, otherwise do nothing. You should always use them around
%ifidn __OUTPUT_FORMAT__,macho
; There is no real global offset table on OS X, but we still
; need to reference our variables by offset.
- %define GLOBAL + ebx
+ %define GOT_eax + eax
+ %define GOT_ebx + ebx
+ %define GOT_ecx + ecx
+ %define GOT_edx + edx
%macro picgetgot 1
call %%getgot
%%getgot:
%define GOT __GLOBAL_OFFSET_TABLE_
%endif
extern GOT
- ; FIXME: find an elegant way to use registers other than ebx
- %define GLOBAL + ebx wrt ..gotoff
+ %define GOT_eax + eax wrt ..gotoff
+ %define GOT_ebx + ebx wrt ..gotoff
+ %define GOT_ecx + ecx wrt ..gotoff
+ %define GOT_edx + edx wrt ..gotoff
%macro picgetgot 1
call %%getgot
%%getgot:
%endmacro
%define picesp esp+4
%else
- %define GLOBAL
+ %define GOT_eax
+ %define GOT_ebx
+ %define GOT_ecx
+ %define GOT_edx
%macro picgetgot 1
%endmacro
%macro picpush 1
mov edx, [picesp+20] ; src
mov ecx, [picesp+24] ; i_src
- pshufw mm4, [picesp+28], 0 ; weight_dst
- movq mm5, [pw_64 GLOBAL]
- psubw mm5, mm4 ; weight_src
- movq mm6, [pw_32 GLOBAL] ; rounding
+ pshufw mm4, [picesp+28], 0 ; weight_dst
+ movq mm5, [pw_64 GOT_ebx]
+ psubw mm5, mm4 ; weight_src
+ movq mm6, [pw_32 GOT_ebx] ; rounding
pxor mm7, mm7
%endmacro
%macro BIWEIGHT_END_MMX 0
pshufw mm5, [picesp+20], 0 ; mm5 = dx
pshufw mm6, [picesp+24], 0 ; mm6 = dy
- movq mm4, [pw_8 GLOBAL]
+ movq mm4, [pw_8 GOT_ebx]
movq mm0, mm4
psubw mm4, mm5 ; mm4 = 8-dx
punpcklbw mm2, mm3
punpcklbw mm1, mm3
- paddw mm0, [pw_32 GLOBAL]
+ paddw mm0, [pw_32 GOT_ebx]
pmullw mm2, mm5 ; line * cB
pmullw mm1, mm7 ; line * cD
mov edi, [picesp + tdst1]
lea ebp, [picesp + tbuffer]
mov esi, [picesp + tsrc]
- movq mm7, [mmx_dw_one GLOBAL]
+ movq mm7, [mmx_dw_one GOT_ebx]
picpop ebx
paddw mm3, mm4
paddw mm1, mm6
- movq mm5, [mmx_dw_20 GLOBAL]
- movq mm4, [mmx_dw_5 GLOBAL]
+ movq mm5, [mmx_dw_20 GOT_ebx]
+ movq mm4, [mmx_dw_5 GOT_ebx]
movq mm6, mm1
pxor mm7, mm7
punpckhwd mm5, mm2
punpcklwd mm4, mm3
- punpcklwd mm2, [mmx_dw_20 GLOBAL]
- punpckhwd mm3, [mmx_dw_5 GLOBAL]
+ punpcklwd mm2, [mmx_dw_20 GOT_ebx]
+ punpckhwd mm3, [mmx_dw_5 GOT_ebx]
pcmpgtw mm7, mm1
paddd mm2, mm1
paddd mm3, mm6
- paddd mm2, [mmx_dd_one GLOBAL]
- paddd mm3, [mmx_dd_one GLOBAL]
+ paddd mm2, [mmx_dd_one GOT_ebx]
+ paddd mm3, [mmx_dd_one GOT_ebx]
psrad mm2, 10
psrad mm3, 10
pxor mm0, mm0
picpush ebx
picgetgot ebx
- movq mm7, [mmx_dw_one GLOBAL]
+ movq mm7, [mmx_dw_one GOT_ebx]
picpop ebx
mov ecx, [esp + 32] ; height
movdqa %2, %1
psrldq %1, 2
paddusw %1, %2
- pand %1, [pd_0000ffff GLOBAL]
+ pand %1, [pd_0000ffff GOT_ebx]
movdqa %2, %1
psrldq %1, 4
paddd %1, %2
pavgb mm1, mm2
pxor mm2, mm3
movq %1 , %2
- pand mm2, [pb_1 GLOBAL]
+ pand mm2, [pb_1 GOT_ebx]
psubusb mm1, mm2
pavgb %1 , mm1 ; %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%endmacro
pxor mm1, mm1
psadbw mm0, mm1
psadbw mm4, mm1
- paddw mm0, [pw_8 GLOBAL]
+ paddw mm0, [pw_8 GOT_ebx]
paddw mm0, mm4
psrlw mm0, 4
pshufw mm0, mm0, 0
paddw mm0, [picesp + 8]
pshufw mm2, [picesp + 12], 0
psrlw mm0, 3
- paddw mm1, [pw_2 GLOBAL]
+ paddw mm1, [pw_2 GOT_ebx]
movq mm3, mm2
pshufw mm1, mm1, 0
pshufw mm0, mm0, 0 ; dc0 (w)
pshufw mm2, [picesp +12], 0
pshufw mm4, [picesp +16], 0
movq mm1, mm2
- pmullw mm2, [pw_3210 GLOBAL]
+ pmullw mm2, [pw_3210 GOT_ebx]
psllw mm1, 2
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
pshufw mm4, [picesp +16], 0
movq mm5, mm2
movq mm1, mm2
- pmullw mm5, [pw_3210 GLOBAL]
+ pmullw mm5, [pw_3210 GOT_ebx]
psllw mm2, 3
psllw mm1, 2
movq mm3, mm2
predict_16x16_dc_top_mmxext:
picpush ebx
picgetgot ebx
- PRED16x16_DC [pw_8 GLOBAL], 4, picesp
+ PRED16x16_DC [pw_8 GOT_ebx], 4, picesp
picpop ebx
ret
neg eax
picpush ebx
picgetgot ebx
- movq mm6, [pd_1 GLOBAL]
+ movq mm6, [pd_1 GOT_ebx]
picpop ebx
movd mm5, eax
pxor mm7, mm7