endstruc
; LOAD_GLOBAL %1=dst, %2=global array symbol, %3=register offset (or literal 0), %4=extra offset expr
; Zero-extending byte load from a global table: dst = (byte)[%2 + %3 + %4].
; Patch context: the old PIC64/PIC32 split collapses into a single PIC case
; (x86_64 RIP-relative); the x86_32 GOT-based PIC32 path is deleted.
%macro LOAD_GLOBAL 4
-%ifdef PIC64
+%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
; RIP-relative addressing cannot combine with an index register, so the base
; is materialized first (r11 is volatile scratch on x86_64).
lea r11, [%2 GLOBAL]
%ifnidn %3, 0
add r11, %3
%endif
movzx %1, byte [r11+%4]
-%elifdef PIC32
- %ifnidn %3, 0
- lea %1, [%3+%4]
- movzx %1, byte [%2+%1 GLOBAL]
- %else
- movzx %1, byte [%2+%3+%4 GLOBAL]
- %endif
%else
; non-PIC: absolute address allows base+reg+disp in a single instruction
movzx %1, byte [%2+%3+%4]
%endif
; x264_cabac_encode_decision_asm — CABAC binary-decision encoder.
; Declared with 0 register args / 7 regs used; all three arguments are read
; from the stack (r0m = cb, r1m = context index, r2m = decision bit).
; Patch context: picgetgot and both %ifdef PIC32 branches are removed;
; t2 now always holds r2m. (Hunk is truncated; body continues past this view.)
cglobal x264_cabac_encode_decision_asm, 0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
- picgetgot t2
; load coder range and the per-context state byte
mov t5d, [r0+cb.range]
movzx t3d, byte [r0+cb.state+t1]
mov t4d, t5d
; NOTE(review): `mov t4d, t5d` immediately followed by `sub t4d, t5d` always
; yields 0 — an intermediate instruction appears to be missing from this
; hunk; verify against the upstream file.
sub t4d, t5d
mov t6d, t3d
shr t6d, 6
-%ifdef PIC32
- cmp t6d, r2m
-%else
movifnidn t2d, r2m
cmp t6d, t2d
-%endif
mov t6d, [r0+cb.low]
lea t7, [t6+t4]
; conditionally select the alternate range/low using flags from the cmp above
cmovne t4d, t5d
cmovne t6d, t7d
-%ifdef PIC32
- mov t1, r2m
- LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
-%else
; next-state lookup: index = t2 (decision) scaled table row, t3*2 byte offset
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
-%endif
movifnidn t1d, r1m
mov [r0+cb.state+t1], t3b
.renorm:
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
- picgetgot edx
paddw m0, [pw_32 GLOBAL]
SPILL r1, 0
IDCT8_1D 0,1,2,3,4,5,6,7,r1
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1,1
+cglobal x264_dct4x4dc_mmx, 1,1
movq m0, [r0+ 0]
movq m1, [r0+ 8]
movq m2, [r0+16]
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2,1
+cglobal x264_add4x4_idct_mmx, 2,2
.skip_prologue:
movq m0, [r1+ 0]
movq m1, [r1+ 8]
movhps [r0+56], m3
ret
-cglobal x264_add8x8_idct_sse2, 2,2,1
+cglobal x264_add8x8_idct_sse2, 2,2
.skip_prologue:
call .8x4
add r1, 64
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
-cglobal %1, 2,2,1
+cglobal %1, 2,2
.skip_prologue:
call %2
add r0, %4-%5-%6*FDEC_STRIDE
movd [r2+1*FDEC_STRIDE], xmm1
movd [r2+2*FDEC_STRIDE], xmm2
movd [r2+3*FDEC_STRIDE], xmm3
- picgetgot r1
punpckldq xmm0, xmm1
punpckldq xmm2, xmm3
punpckldq xmm4, xmm5
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5,1
+cglobal x264_deblock_%2_luma_%1, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,1
+cglobal x264_deblock_%2_luma_intra_%1, 4,6
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
- picgetgot r4
DEBLOCK_P0_Q0
ret
LOAD_MASK r2d, r3d
movq m5, m1
movq m6, m2
- picgetgot r2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
; INIT_SHIFT %1, %2 — prepare a complementary pair of shift counts in mmx regs:
; eax = (eax & 7) * 8, i.e. a bit count 0..56 derived from a byte misalignment;
; %2 <- eax, %1 <- 64 - eax (presumably used as shift counts by the caller —
; TODO confirm against the call sites).
; Patch context: the PIC32 gpr-based variant is deleted; the remaining form
; loads the constant 64 from memory (sw_64), per the removed comment's own
; speed ranking (mem->mmx fastest).
%macro INIT_SHIFT 2
and eax, 7
shl eax, 3
-%ifdef PIC32
- ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx
- mov r2, 64
- sub r2, eax
- movd %2, eax
- movd %1, r2
-%else
movd %1, [sw_64 GLOBAL]
movd %2, eax
psubw %1, %2
-%endif
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
INIT_SHIFT mm6, mm7
mov eax, r4m
INIT_SHIFT mm4, mm5
- PROLOGUE 6,6,0
+ PROLOGUE 6,6
and r2, ~7
and r4, ~7
sub r4, r2
%else
SPLATW m4, r4m
%endif
- picgetgot r4
mova m5, [pw_64 GLOBAL]
psubw m5, m4 ; weight_src
mova m6, [pw_32 GLOBAL] ; rounding
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1
+cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4
BIWEIGHT_START 0
BIWEIGHT [r0 ], [r2 ]
BIWEIGHT [r0+r1 ], [r2+r3 ]
; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 1
-cglobal x264_mc_chroma_%1, 0,6,1
+cglobal x264_mc_chroma_%1, 0,6
%if mmsize == 16
cmp dword r6m, 4
jle x264_mc_chroma_mmxext %+ .skip_prologue
MC_CHROMA sse2
INIT_MMX
-cglobal x264_mc_chroma_ssse3, 0,6,1
+cglobal x264_mc_chroma_ssse3, 0,6
MC_CHROMA_START
and r4d, 7
and r5d, 7
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6,1
+cglobal x264_hpel_filter_v_%1, 5,6
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_mmxext, 3,3,1
+cglobal x264_hpel_filter_c_mmxext, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_mmxext, 3,3,1
+cglobal x264_hpel_filter_h_mmxext, 3,3
add r0, r2
add r1, r2
neg r2
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3,1
+cglobal x264_hpel_filter_c_%1, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3,1
+cglobal x264_hpel_filter_h_sse2, 3,3
add r0, r2
add r1, r2
neg r2
%endmacro
%macro SATD_END_SSE2 0
- picgetgot ebx
psrlw m6, 1
HADDW m6, m7
movd eax, m6
paddusw m0, m7
paddusw m5, m4
pavgw m0, m5
- picgetgot ebx
HADDW m0, m7
movd eax, m0
mov ecx, eax ; preserve rounding for 16x16
%endrep
; PHADDW m1, m2
; PHADDD m3, m4
- picgetgot eax
movdqa m7, [pw_1 GLOBAL]
pshufd m5, m3, 0xb1
pmaddwd m1, m7
paddd m1, m2
paddd m2, m3
paddd m3, m4
- picgetgot r1
movdqa m5, [ssim_c1 GLOBAL]
movdqa m6, [ssim_c2 GLOBAL]
TRANSPOSE4x4D 0, 1, 2, 3, 4
cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
; RIP-relative addressing cannot take an index register, so under PIC the
; table base is materialized with lea first, then indexed separately.
; Patch context: the conditional is renamed PIC64 -> PIC (PIC is now
; x86_64-only).
-%ifdef PIC64
- lea r3, [mask_ff + 16 GLOBAL]
+%ifdef PIC
+ lea r3, [mask_ff + 16 GLOBAL]
movdqu m1, [r3 + r2*4]
%else
; non-PIC: absolute address permits base+index+disp in one instruction
movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_ddl_mmxext, 1,1,1
+cglobal predict_4x4_ddl_mmxext, 1,1
sub r0, FDEC_STRIDE
movq mm3, [r0]
movq mm1, [r0-1]
;-----------------------------------------------------------------------------
; void predict_4x4_vl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_vl_mmxext, 1,1,1
+cglobal predict_4x4_vl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
movq mm3, mm1
movq mm2, mm1
;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
-cglobal predict_8x8_dc_mmxext, 2,2,1
+cglobal predict_8x8_dc_mmxext, 2,2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1+7]
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
%macro PRED8x8_DC 2
-cglobal %1, 2,2,1
+cglobal %1, 2,2
pxor mm0, mm0
psadbw mm0, [r1+%2]
paddw mm0, [pw_4 GLOBAL]
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_mmxext, 2,2,1
+cglobal predict_8x8_ddl_mmxext, 2,2
movq mm5, [r1+16]
movq mm2, [r1+17]
movq mm3, [r1+23]
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_mmxext, 2,2,1
+cglobal predict_8x8_ddr_mmxext, 2,2
movq mm1, [r1+7]
movq mm2, [r1+9]
movq mm3, [r1+15]
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_sse2, 2,2,1
+cglobal predict_8x8_ddl_sse2, 2,2
movdqa xmm3, [r1+16]
movdqu xmm2, [r1+17]
movdqa xmm1, xmm3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_sse2, 2,2,1
+cglobal predict_8x8_ddr_sse2, 2,2
movdqu xmm3, [r1+8]
movdqu xmm1, [r1+7]
movdqa xmm2, xmm3
;-----------------------------------------------------------------------------
; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl_sse2, 2,2,1
+cglobal predict_8x8_vl_sse2, 2,2
movdqa xmm4, [r1+16]
movdqa xmm2, xmm4
movdqa xmm1, xmm4
; 6 .....
; 7 ,,,,,
-cglobal predict_8x8_vr_core_mmxext, 2,2,1
+cglobal predict_8x8_vr_core_mmxext, 2,2
movq mm2, [r1+16]
movq mm3, [r1+15]
movq mm1, [r1+14]
;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_dc_core_mmxext, 1,1,1
+cglobal predict_8x8c_dc_core_mmxext, 1,1
movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_p_core_mmxext, 1,2,1
+cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210 GLOBAL]
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_mmxext, 1,2,1
+cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2, 1,2,1
+cglobal predict_16x16_p_core_sse2, 1,2
movd xmm0, r1m
movd xmm1, r2m
movd xmm2, r3m
%endif
REP_RET
-cglobal predict_16x16_dc_top_mmxext, 1,2,1
+cglobal predict_16x16_dc_top_mmxext, 1,2
PRED16x16_DC [pw_8 GLOBAL], 4
REP_RET
PRED16x16_DC_SSE2 xmm2, 5
REP_RET
-cglobal predict_16x16_dc_top_sse2, 1,2,1
+cglobal predict_16x16_dc_top_sse2, 1,2
PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
REP_RET
.rshift32:
neg t0d
movd m5, t0d
- picgetgot t0d
mova m6, [pd_1 GLOBAL]
pxor m7, m7
pslld m6, m5
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %3
; Compute r1 = &dequant%2_scale[t2].  Under PIC, RIP-relative lea cannot add
; a register in the same instruction, hence the separate add.
; Patch context: PIC64 -> PIC; the x86_32 picgetgot setup is removed and the
; non-PIC lea is reordered to the conventional symbol+reg form.
-%ifdef PIC64
+%ifdef PIC
lea r1, [dequant%2_scale GLOBAL]
add r1, t2
%else
- picgetgot r0
- lea r1, [t2 + dequant%2_scale GLOBAL]
+ lea r1, [dequant%2_scale + t2 GLOBAL]
%endif
movifnidn r0d, r0m
movd m7, t0d
and eax, 0x37
cmp eax, 0x30
jle x264_pixel_sad_16x%2_sse2
- PROLOGUE 4,6,0
+ PROLOGUE 4,6
mov r4d, r2d
and r4d, 15
%ifidn %1, ssse3
shl r4d, 4 ; code size = 80
%endif
; Computed jump target into the aligned sad_w16 code variants: base address
; plus r4 (a code-size-scaled alignment index computed earlier in the hunk).
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
; Patch context: PIC64 -> PIC; under PIC the RIP-relative lea cannot carry an
; index register, so the add is separate.  picgetgot is gone in the else path.
-%ifdef PIC64
+%ifdef PIC
lea r5, [sad_w16_addr GLOBAL]
add r5, r4
%else
- picgetgot r5
lea r5, [sad_w16_addr + r4 GLOBAL]
%endif
and r2, ~15
jle x264_pixel_sad_%1x%2_mmxext
and eax, 7
shl eax, 3
-%ifdef PIC32
- ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx
- mov r2, 64
- sub r2, eax
- movd mm7, eax
- movd mm6, r2
-%else
movd mm6, [sw_64 GLOBAL]
movd mm7, eax
psubw mm6, mm7
-%endif
- PROLOGUE 4,5,0
+ PROLOGUE 4,5
and r2, ~7
mov r4d, %3
pxor mm0, mm0
%endif
%endmacro
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-; picgetgot ebx
+; PIC support macros.
+; x86_64 can't fit 64bit address literals in most instruction types,
+; so shared objects (under the assumption that they might be anywhere
+; in memory) must use an address mode that does fit.
+; So all accesses to global variables must use the GLOBAL suffix, e.g.
; mov eax, [foo GLOBAL]
-; instead of
+; instead of
; mov eax, [foo]
;
-; - picgetgot computes the GOT address into the given register in PIC
-; mode, otherwise does nothing. You need to do this before using GLOBAL.
-; Before in both execution order and compiled code order (so GLOBAL knows
-; which register the GOT is in).
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
; Patch context: PIC becomes x86_64-only.  After this change, a PIC build on
; x86_64 expands GLOBAL to "wrt rip" (RIP-relative addressing); everywhere
; else GLOBAL is empty (absolute addressing).  The entire x86_32 GOT
; machinery — picgetgot, PIC32, and the Mach-O vs ELF variants — is deleted.
-%ifndef PIC
- %define GLOBAL
- %macro picgetgot 1
- %endmacro
-%elifdef ARCH_X86_64
- %define PIC64
+%ifndef ARCH_X86_64
+ %undef PIC
+%endif
+%ifdef PIC
%define GLOBAL wrt rip
- %macro picgetgot 1
- %endmacro
%else
- %define PIC32
- %ifidn __OUTPUT_FORMAT__,macho
- ; There is no real global offset table on OS X, but we still
- ; need to reference our variables by offset.
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, $$ - %%getgot
- %undef GLOBAL
- %define GLOBAL + %1 - fakegot
- %endmacro
- %else ; elf
- extern _GLOBAL_OFFSET_TABLE_
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
- %undef GLOBAL
- %define GLOBAL + %1 wrt ..gotoff
- %endmacro
- %endif
+ %define GLOBAL
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
-; %4 = list of names to define to registers
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+; cglobal foo, 2,3, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
%endif
%endmacro
; PROLOGUE #args, #regs [, arg_names...]
; Patch context: the third "pic" parameter (previously defaulted to 0) is
; removed, so the greedy arg_names tail shifts from %4 to %3.
; (Hunk is elided; additional LOAD_IF_USED lines exist between these.)
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
ASSERT %2 >= %1
ASSERT %2 <= 7
%assign stack_offset 0
LOAD_IF_USED 6, %1
- DEFINE_ARGS %4
+ DEFINE_ARGS %3
%endmacro
%macro RET 0
%endif
%endmacro
; PROLOGUE #args, #regs [, arg_names...]  (second variant; presumably the
; other-architecture half of an %ifdef — TODO confirm against full file).
; Patch context: the "pic" parameter is removed, along with the extra
; register it reserved (regs_used+1) and the picgetgot call; arg_names
; shifts from %4 to %3.
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
ASSERT %2 >= %1
%assign stack_offset 0
%assign regs_used %2
- %ifdef PIC
- %if %3
- %assign regs_used regs_used+1
- %endif
- %endif
ASSERT regs_used <= 7
PUSH_IF_USED 3
PUSH_IF_USED 4
LOAD_IF_USED 4, %1
LOAD_IF_USED 5, %1
LOAD_IF_USED 6, %1
- %if %3
- picgetgot r%2
- %endif
- DEFINE_ARGS %4
+ DEFINE_ARGS %3
%endmacro
%macro RET 0