; NOTE(review): resolved patch fragment. The %macro header that opens this
; body lies outside the visible range; %1 is the macro's function-name
; argument. Diff markers (+/-) removed so the file is valid NASM again.
%ifdef WIN64
%define %1 pad %1
%endif
align 16
%1:
%endmacro
%macro pad 1
; NOTE(review): body of the 'pad' macro is elided in this fragment — confirm
; against the full source before assembling.
SECTION .text

; NOTE(review): diff fragment resolved to post-patch state: forward ALIGN/label
; pairs replaced by 'cglobal' at each definition (cglobal emits global + align
; + label). Function bodies are elided in this view.

;-----------------------------------------------------------------------------
; int x264_cpu_cpuid_test( void ) return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid_test
firstpush rbx
pushreg rbx
push rbp
ret
endfunc

;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid
firstpush rbx
pushreg rbx
endprolog
ret
endfunc

;-----------------------------------------------------------------------------
; void x264_emms( void )
;-----------------------------------------------------------------------------
cglobal x264_emms
emms
ret
SECTION .text

; NOTE(review): diff fragment resolved to post-patch state; bodies elided.

;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx
movq mm0, [parm1q+ 0]
movq mm1, [parm1q+ 8]
movq mm2, [parm1q+16]
movq [parm1q+24],mm4
ret

;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx
movq mm0, [parm1q+ 0]
movq mm1, [parm1q+ 8]
movq mm2, [parm1q+16]
movq [parm1q+24], mm4
ret

;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx
MMX_ZERO mm7
; Load 4 lines
movq [parm1q+24], mm0
ret

;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx
; Load dct coeffs
movq mm0, [parm2q+ 0] ; dct
movq mm1, [parm2q+ 8]
psubw %4, %1 ; %4=b5
%endmacro

;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
MMX_ZERO xmm9
MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
MMX_SUMSUB_BA %4, %5 ; %4=c3, %5=c4
%endmacro

;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2
movdqa xmm0, [parm2q+0x00]
movdqa xmm1, [parm2q+0x10]
movdqa xmm2, [parm2q+0x20]
; uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1
call %2
add parm1q, %3
add parm2q, %4-%5*FENC_STRIDE
; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
cglobal %1
call %2
add parm1q, %4-%5*FDEC_STRIDE
add parm2q, %3
;-----------------------------------------------------------------------------
; void __cdecl x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_field_sse2
punpcklwd xmm0, [parm2q]
punpckhwd xmm1, [parm2q]
punpcklwd xmm2, [parm2q+16]
pb_a1: times 16 db 0xa1

SECTION .text

; NOTE(review): diff fragment resolved to post-patch state; bodies elided.
; The PASS8ROWS %define below ends in a line continuation whose tail is
; elided in this fragment — confirm against the full source.
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
SECTION .text
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_luma_sse2
; rdi = pix
movsxd rsi, esi ; stride
dec edx ; alpha-1
ret

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_sse2
movsxd r10, esi
lea r11, [r10+r10*2]
lea rax, [rdi-4]
add rdi, r9
%endmacro

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_mmxext
CHROMA_V_START
movq mm0, [rax]
ret

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_mmxext
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
paddb mm2, mm6
%endmacro

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_intra_mmxext
CHROMA_V_START
movq mm0, [rax]
movq [rdi], mm2
ret

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_intra_mmxext
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
CHROMA_INTRA_BODY
SECTION .text

; NOTE(review): diff fragment resolved to post-patch state: forward cglobal
; declaration list removed (cglobal at each definition now declares the
; symbol); bodies elided in this view.

;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w4_mmxext
mov r10, parm5q ; src2
movsxd r11, parm6d ; i_src2_stride
mov eax, parm7d ; i_height

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w8_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w8_mmxext
mov r10, parm5q ; src2
movsxd r11, parm6d ; i_src2_stride
mov eax, parm7d ; i_height
jg .height_loop
rep ret

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w16_mmxext
mov r10, parm5q ; src2
movsxd r11, parm6d ; i_src2_stride
mov eax, parm7d ; i_height
jg .height_loop
rep ret

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w20_mmxext
mov r10, parm5q ; src2
movsxd r11, parm6d ; i_src2_stride
mov eax, parm7d ; i_height
jg .height_loop
rep ret

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w16_sse2
mov r10, parm5q ; src2
movsxd r11, parm6d ; i_src2_stride
mov eax, parm7d ; i_height
.height_loop
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_w16_mmxext
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
jg .height_loop
rep ret

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_w8_mmxext
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
jg .height_loop
rep ret

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_4x4_mmxext
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
BIWEIGHT_4P_MMX [parm1q+parm2q ], [parm3q+parm4q ]
; pixel copy
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx
mov eax, parm5d ; i_height
ALIGN 4
jg .height_loop
rep ret

;-----------------------------------------------------------------------------
; void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w8_mmx
mov eax, parm5d ; i_height
lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride
jg .height_loop
rep ret

;-----------------------------------------------------------------------------
; void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_mmx
mov eax, parm5d ; i_height
lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride
rep ret

;-----------------------------------------------------------------------------
; void x264_mc_copy_w16_sse2( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_sse2
mov eax, parm5d ; i_height
ALIGN 4
; chroma MC
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
; uint8_t *dst, int i_dst_stride,
; int dx, int dy,
; int i_width, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_chroma_mmxext
mov r10d, parm6d
mov r11d, parm5d
sar r10d, 3
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
cglobal x264_prefetch_fenc_mmxext
mov eax, parm5d
and eax, 3
imul eax, parm2d
;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext
dec parm3d
and parm3d, parm2d
lea parm1q, [parm1q+parm3q*8+64]
SECTION .text

; NOTE(review): diff fragment resolved to post-patch state; bodies elided.
; The stray space in the old label 'x264_hpel_filter_mmxext :' is gone with
; the cglobal conversion.

;-----------------------------------------------------------------------------
; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
; int i_stride, int i_width, int i_height );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_mmxext
%ifdef WIN64
push rdi
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
; uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
cglobal x264_plane_copy_mmxext
movsxd parm2q, parm2d
movsxd parm4q, parm4d
add parm5d, 3
SECTION .text

; NOTE(review): diff fragment resolved to post-patch state: the long forward
; cglobal declaration list removed (cglobal at each definition declares the
; symbol); bodies elided in this view.

%macro SAD_START 0
pxor mm0, mm0
%endmacro
; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal x264_pixel_sad_%1x%2_mmxext
SAD_START
%rep %2/2
SAD_INC_2x%1P
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal x264_pixel_sad_x%1_%2x%3_mmxext
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
; int x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
%macro SAD_PDE 2
cglobal x264_pixel_sad_pde_%1x%2_mmxext
SAD_START
%rep %2/4
SAD_INC_2x%1P
; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 2
cglobal x264_pixel_ssd_%1x%2_mmx
SSD_START
%rep %2
SSD_INC_1x%1P
ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x4_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x8_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
paddw mm0, mm1
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
paddw mm0, mm1
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x8_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
paddw mm0, mm1
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
paddw mm0, mm1
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
paddw mm0, mm1
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
%8 %3, %6
%endmacro

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_4x4_mmxext
%define top_1d rsp-8 ; +8
%define left_1d rsp-16 ; +8
call load_hadamard
movd [parm3q+8], mm5 ; i4x4_dc satd
ret

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_mmxext
%define sums rsp-32 ; +24
%define top_1d rsp-64 ; +32
%define left_1d rsp-96 ; +32
movd [parm3q+0], mm0 ; i16x16_v satd
ret

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_mmxext
%define sums rsp-32 ; +24
%define top_1d rsp-48 ; +16
%define left_1d rsp-64 ; +16
; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *res, int width )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
movq mm6, [parm1q]
movq mm4, [parm1q+8]
pshufw mm7, mm6, 0
nop
ret

cglobal x264_pixel_ads2_mmxext
movq mm6, [parm1q]
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
nop
ret

cglobal x264_pixel_ads1_mmxext
pshufw mm7, [parm1q], 0
.loop:
movq mm0, [parm2q]
SECTION .text

; NOTE(review): diff fragment resolved to post-patch state: forward cglobal
; declaration list removed; bodies elided in this view.

%macro HADDD 2 ; sum junk
movhlps %2, %1
paddd %1, %2
ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x16_sse2
movdqu xmm0, [rdx]
movdqu xmm1, [rdx+rcx]
lea rdx, [rdx+2*rcx]
paddw xmm0, xmm7
SAD_END_SSE2

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_sse2
pxor xmm0, xmm0
SAD_INC_4x16P_SSE2
SAD_INC_4x16P_SSE2
ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x16_sse2
SSD_START_SSE2
%rep 8
SSD_INC_2x16P_SSE2
%endrep
SSD_END_SSE2

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2
SSD_START_SSE2
%rep 4
SSD_INC_2x16P_SSE2
%endmacro
%macro SATDS 1
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1
SATD_START
mov r8, rdi
mov r9, rdx
SATD_TWO_SSE2
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_%1
SATD_START
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_%1
SATD_START
mov r8, rdi
mov r9, rdx
SATD_TWO_SSE2
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x8_%1
SATD_START
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_%1
SATD_START
SATD_TWO_SSE2
SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sa8d_8x8_%1
lea r10, [3*parm2q]
lea r11, [3*parm4q]
LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
shr eax, 1
ret

;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
cglobal x264_pixel_sa8d_16x16_%1
xor r8d, r8d
call x264_pixel_sa8d_8x8_%1 ; pix[0]
lea parm1q, [parm1q+4*parm2q]

;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_sse2
; 8x8 hadamard
pxor xmm4, xmm4
movq xmm0, [parm1q+0*FENC_STRIDE]
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_4x4x2_core_sse2
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm2, xmm2
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_end4_sse2
movdqa xmm0, [parm1q+ 0]
movdqa xmm1, [parm1q+16]
movdqa xmm2, [parm1q+32]
SECTION .text

; NOTE(review): diff fragment resolved to post-patch state: forward cglobal
; declaration list removed; bodies elided in this view. The stray space in
; the old label 'predict_8x8c_v_mmx :' is gone with the cglobal conversion.

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS0 6
;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_ddl_mmxext
sub parm1q, FDEC_STRIDE
movq mm3, [parm1q]
movq mm1, [parm1q-1]
;-----------------------------------------------------------------------------
; void predict_4x4_vl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vl_mmxext
movq mm1, [parm1q-FDEC_STRIDE]
movq mm3, mm1
movq mm2, mm1
;-----------------------------------------------------------------------------
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_v_mmxext
movq mm0, [parm2q+16]
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
cglobal predict_8x8_dc_mmxext
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [parm2q+7]
;-----------------------------------------------------------------------------
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
cglobal predict_8x8_dc_top_mmxext
pxor mm0, mm0
psadbw mm0, [parm2q+16]
paddw mm0, [pw_4 GLOBAL]
;-----------------------------------------------------------------------------
; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
cglobal predict_8x8_dc_left_mmxext
pxor mm0, mm0
psadbw mm0, [parm2q+7]
paddw mm0, [pw_4 GLOBAL]
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_mmxext
movq mm5, [parm2q+16]
movq mm2, [parm2q+17]
movq mm3, [parm2q+23]
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_sse2
movdqa xmm3, [parm2q+16]
movdqu xmm2, [parm2q+17]
movdqa xmm1, xmm3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr_sse2
movdqu xmm3, [parm2q+8]
movdqu xmm1, [parm2q+7]
movdqa xmm2, xmm3
;-----------------------------------------------------------------------------
; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl_sse2
movdqa xmm4, [parm2q+16]
movdqa xmm2, xmm4
movdqa xmm1, xmm4
; 6 .....
; 7 ,,,,,
cglobal predict_8x8_vr_core_mmxext
movq mm2, [parm2q+16]
movq mm3, [parm2q+15]
movq mm1, [parm2q+14]
;-----------------------------------------------------------------------------
; void predict_8x8c_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_v_mmx
movq mm0, [parm1q - FDEC_STRIDE]
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_dc_core_mmxext
movq mm0, [parm1q - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_mmxext
movd mm0, parm2d
movd mm2, parm3d
movd mm4, parm4d
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_p_core_mmxext:
+cglobal predict_16x16_p_core_mmxext
movd mm0, parm2d
movd mm2, parm3d
movd mm4, parm4d
;-----------------------------------------------------------------------------
; void predict_16x16_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_v_mmx :
+cglobal predict_16x16_v_mmx
sub parm1q, FDEC_STRIDE
movq mm0, [parm1q]
movq mm1, [parm1q + 8]
STORE16x16 mm0, mm0
%endmacro
-ALIGN 16
-predict_16x16_dc_core_mmxext:
+cglobal predict_16x16_dc_core_mmxext
movd mm2, parm2d
PRED16x16_DC mm2, 5
ret
-ALIGN 16
-predict_16x16_dc_top_mmxext:
+cglobal predict_16x16_dc_top_mmxext
PRED16x16_DC [pw_8 GLOBAL], 4
ret
SECTION .text
-cglobal x264_quant_2x2_dc_core15_mmx
-cglobal x264_quant_4x4_dc_core15_mmx
-cglobal x264_quant_4x4_core15_mmx
-cglobal x264_quant_8x8_core15_mmx
-
-cglobal x264_quant_4x4_dc_core15_ssse3
-cglobal x264_quant_4x4_core15_ssse3
-cglobal x264_quant_8x8_core15_ssse3
-
-cglobal x264_quant_2x2_dc_core16_mmxext
-cglobal x264_quant_4x4_dc_core16_mmxext
-cglobal x264_quant_4x4_core16_mmxext
-cglobal x264_quant_8x8_core16_mmxext
-
-cglobal x264_quant_2x2_dc_core32_mmxext
-cglobal x264_quant_4x4_dc_core32_mmxext
-cglobal x264_quant_4x4_core32_mmxext
-cglobal x264_quant_8x8_core32_mmxext
-
-cglobal x264_dequant_4x4_mmx
-cglobal x264_dequant_8x8_mmx
-
%macro MMX_QUANT_AC_START 0
; mov rdi, rdi ; &dct[0][0]
; mov rsi, rsi ; &quant_mf[0][0]
movdqa %1, xmm0 ; store
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core15_mmx:
+cglobal x264_quant_2x2_dc_core15_mmx
MMX_QUANT15_DC_START
MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core15_mmx:
+cglobal x264_quant_4x4_dc_core15_mmx
MMX_QUANT15_DC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_core15_mmx:
+cglobal x264_quant_4x4_core15_mmx
MMX_QUANT_AC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_8x8_core15_mmx:
+cglobal x264_quant_8x8_core15_mmx
MMX_QUANT_AC_START
%rep 16
ret
%ifdef HAVE_SSE3
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core15_ssse3:
+cglobal x264_quant_4x4_dc_core15_ssse3
SSE2_QUANT15_DC_START
SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_core15_ssse3:
+cglobal x264_quant_4x4_core15_ssse3
SSE2_QUANT_AC_START
%assign x 0
%rep 2
%endrep
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_8x8_core15_ssse3:
+cglobal x264_quant_8x8_core15_ssse3
SSE2_QUANT_AC_START
%assign x 0
%rep 8
movq %1, mm0 ; store
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core16_mmxext:
+cglobal x264_quant_2x2_dc_core16_mmxext
MMXEXT_QUANT16_DC_START
MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core16_mmxext:
+cglobal x264_quant_4x4_dc_core16_mmxext
MMXEXT_QUANT16_DC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_core16_mmxext:
+cglobal x264_quant_4x4_core16_mmxext
MMX_QUANT_AC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_8x8_core16_mmxext:
+cglobal x264_quant_8x8_core16_mmxext
MMX_QUANT_AC_START
%rep 16
movq %1, mm0 ; store
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core32_mmxext:
+cglobal x264_quant_2x2_dc_core32_mmxext
MMX_QUANT32_DC_START
MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core32_mmxext:
+cglobal x264_quant_4x4_dc_core32_mmxext
MMX_QUANT32_DC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_core32_mmxext:
+cglobal x264_quant_4x4_core32_mmxext
MMX_QUANT_AC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_8x8_core32_mmxext:
+cglobal x264_quant_8x8_core32_mmxext
MMX_QUANT_AC_START
%rep 16
movq %1, mm0
%endmacro
+;-----------------------------------------------------------------------------
+; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+;-----------------------------------------------------------------------------
%macro DEQUANT_WxH 3
-ALIGN 16
-;;; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-%1:
+cglobal %1
; mov rdi, rdi ; dct
; mov rsi, rsi ; dequant_mf
; mov edx, edx ; i_qp
SECTION .text
-cglobal x264_cpu_cpuid_test
-cglobal x264_cpu_cpuid
-cglobal x264_emms
-
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported
;-----------------------------------------------------------------------------
-x264_cpu_cpuid_test:
+cglobal x264_cpu_cpuid_test
pushfd
push ebx
push ebp
popfd
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
-x264_cpu_cpuid:
+cglobal x264_cpu_cpuid
push ebp
mov ebp, esp
pop ebp
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_emms( void )
;-----------------------------------------------------------------------------
-x264_emms:
+cglobal x264_emms
emms
ret
SECTION .text
-cglobal x264_dct4x4dc_mmx
-
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-x264_dct4x4dc_mmx:
+cglobal x264_dct4x4dc_mmx
mov eax, [esp+ 4]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
picpop ebx
ret
-cglobal x264_idct4x4dc_mmx
-
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-x264_idct4x4dc_mmx:
+cglobal x264_idct4x4dc_mmx
mov eax, [esp+ 4]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq [eax+24], mm4
ret
-cglobal x264_sub4x4_dct_mmx
-
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-x264_sub4x4_dct_mmx:
+cglobal x264_sub4x4_dct_mmx
mov eax, [esp+ 8] ; pix1
mov ecx, [esp+12] ; pix2
ret
-cglobal x264_add4x4_idct_mmx
-
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-x264_add4x4_idct_mmx:
+cglobal x264_add4x4_idct_mmx
; Load dct coeffs
mov eax, [esp+ 8] ; dct
movq mm0, [eax+ 0]
MMX_SUMSUB_BA %1, %2
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
;-----------------------------------------------------------------------------
+ALIGN 16
x264_pixel_sub_8x8_mmx:
mov edx, [esp+ 4] ; diff
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
+ALIGN 16
x264_ydct8_mmx:
mov eax, [esp+04] ; dest
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
+ALIGN 16
x264_yidct8_mmx:
mov eax, [esp+04] ; dest
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
;-----------------------------------------------------------------------------
+ALIGN 16
x264_pixel_add_8x8_mmx:
mov eax, [esp+4] ; dst
mov edx, [esp+8] ; src
%endrep
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_transpose_8x8_mmx( int16_t src[8][8] );
;-----------------------------------------------------------------------------
+ALIGN 16
x264_transpose_8x8_mmx:
mov eax, [esp+4]
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-ALIGN 16
cglobal x264_sub8x8_dct8_mmx
-x264_sub8x8_dct8_mmx:
push dword [esp+12]
push dword [esp+12]
push dword [esp+12]
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-ALIGN 16
cglobal x264_add8x8_idct8_mmx
-x264_add8x8_idct8_mmx:
mov eax, [esp+8]
add word [eax], 32
push eax
; uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 4
-ALIGN 16
cglobal %1
-%1:
mov edx, [esp+12]
mov ecx, [esp+ 8]
mov eax, [esp+ 4]
; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 4
-ALIGN 16
cglobal %1
-%1:
mov ecx, [esp+8]
mov eax, [esp+4]
add ecx, %3
;-----------------------------------------------------------------------------
; void __cdecl x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-ALIGN 16
cglobal x264_zigzag_scan_4x4_field_mmx
-x264_zigzag_scan_4x4_field_mmx:
mov edx, [esp+8]
mov ecx, [esp+4]
punpcklwd mm0, [edx]
pb_a1: times 8 db 0xa1
SECTION .text
-cglobal x264_deblock_v8_luma_mmxext
-cglobal x264_deblock_h_luma_mmxext
-cglobal x264_deblock_v_chroma_mmxext
-cglobal x264_deblock_h_chroma_mmxext
-cglobal x264_deblock_v_chroma_intra_mmxext
-cglobal x264_deblock_h_chroma_intra_mmxext
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
SECTION .text
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-x264_deblock_v8_luma_mmxext:
+cglobal x264_deblock_v8_luma_mmxext
picpush ebx
picgetgot ebx
push edi
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-x264_deblock_h_luma_mmxext:
+cglobal x264_deblock_h_luma_mmxext
push ebx
push ebp
mov eax, [esp+12] ; pix
ret
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-x264_deblock_v_chroma_mmxext:
+cglobal x264_deblock_v_chroma_mmxext
CHROMA_V_START
push ebx
mov ebx, [esp+32] ; tc0
CHROMA_END
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-x264_deblock_h_chroma_mmxext:
+cglobal x264_deblock_h_chroma_mmxext
CHROMA_H_START
push ebx
mov ebx, [esp+36] ; tc0
paddb mm2, mm6
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-x264_deblock_v_chroma_intra_mmxext:
+cglobal x264_deblock_v_chroma_intra_mmxext
CHROMA_V_START
picpush ebx
picgetgot ebx
picpop ebx
CHROMA_END
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-x264_deblock_h_chroma_intra_mmxext:
+cglobal x264_deblock_h_chroma_intra_mmxext
CHROMA_H_START
picpush ebx
picgetgot ebx
%else
global %1
%endif
+ align 16
+ %1:
%endmacro
; Name of the .rodata section. On OS X we cannot use .rodata because NASM
SECTION .text
-cglobal x264_pixel_avg_w4_mmxext
-cglobal x264_pixel_avg_w8_mmxext
-cglobal x264_pixel_avg_w16_mmxext
-cglobal x264_pixel_avg_w20_mmxext
-cglobal x264_pixel_avg_w16_sse2
-
-cglobal x264_pixel_avg_weight_4x4_mmxext
-cglobal x264_pixel_avg_weight_w8_mmxext
-cglobal x264_pixel_avg_weight_w16_mmxext
-
-cglobal x264_mc_copy_w4_mmx
-cglobal x264_mc_copy_w8_mmx
-cglobal x264_mc_copy_w16_mmx
-cglobal x264_mc_copy_w16_sse2
-
-cglobal x264_mc_chroma_mmxext
-
-cglobal x264_prefetch_fenc_mmxext
-cglobal x264_prefetch_ref_mmxext
-
;=============================================================================
; pixel avg
;=============================================================================
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
-x264_pixel_avg_w4_mmxext:
+cglobal x264_pixel_avg_w4_mmxext
push ebp
push ebx
push esi
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w8_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
-x264_pixel_avg_w8_mmxext:
+cglobal x264_pixel_avg_w8_mmxext
push ebp
push ebx
push esi
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
-x264_pixel_avg_w16_mmxext:
+cglobal x264_pixel_avg_w16_mmxext
push ebp
push ebx
push esi
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
-x264_pixel_avg_w20_mmxext:
+cglobal x264_pixel_avg_w20_mmxext
push ebp
push ebx
push esi
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
-x264_pixel_avg_w16_sse2:
+cglobal x264_pixel_avg_w16_sse2
push ebp
push ebx
push esi
ret
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_w16_mmxext:
+cglobal x264_pixel_avg_weight_w16_mmxext
BIWEIGHT_START_MMX
mov eax, [picesp+32] ; i_height
ALIGN 4
jg .height_loop
BIWEIGHT_END_MMX
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_w8_mmxext:
+cglobal x264_pixel_avg_weight_w8_mmxext
BIWEIGHT_START_MMX
mov eax, [picesp+32]
ALIGN 4
jg .height_loop
BIWEIGHT_END_MMX
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_4x4_mmxext:
+cglobal x264_pixel_avg_weight_4x4_mmxext
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [edi ], [edx ]
BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ]
; pixel copy
;=============================================================================
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *src, int i_src_stride,
; uint8_t *dst, int i_dst_stride, int i_height )
;-----------------------------------------------------------------------------
-x264_mc_copy_w4_mmx:
+cglobal x264_mc_copy_w4_mmx
push ebx
push esi
push edi
pop ebx
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_mc_copy_w8_mmx( uint8_t *src, int i_src_stride,
; uint8_t *dst, int i_dst_stride, int i_height )
;-----------------------------------------------------------------------------
-x264_mc_copy_w8_mmx:
+cglobal x264_mc_copy_w8_mmx
push ebx
push esi
push edi
pop ebx
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_mc_copy_w16_mmx( uint8_t *src, int i_src_stride,
; uint8_t *dst, int i_dst_stride, int i_height )
;-----------------------------------------------------------------------------
-x264_mc_copy_w16_mmx:
+cglobal x264_mc_copy_w16_mmx
push ebx
push esi
push edi
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
;-----------------------------------------------------------------------------
-x264_mc_copy_w16_sse2:
+cglobal x264_mc_copy_w16_sse2
push ebx
push esi
push edi
; chroma MC
;=============================================================================
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
; uint8_t *dst, int i_dst_stride,
; int i_width, int i_height )
;-----------------------------------------------------------------------------
-x264_mc_chroma_mmxext:
+cglobal x264_mc_chroma_mmxext
picpush ebx
picgetgot ebx
push edi
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
-ALIGN 16
-x264_prefetch_fenc_mmxext:
+cglobal x264_prefetch_fenc_mmxext
mov eax, [esp+20]
mov ecx, [esp+8]
mov edx, [esp+4]
;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
-ALIGN 16
-x264_prefetch_ref_mmxext:
+cglobal x264_prefetch_ref_mmxext
mov eax, [esp+12]
mov ecx, [esp+8]
mov edx, [esp+4]
SECTION .text
-cglobal x264_hpel_filter_mmxext
-cglobal x264_plane_copy_mmxext
-
;-----------------------------------------------------------------------------
; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
; int i_stride, int i_width, int i_height );
;-----------------------------------------------------------------------------
-
-ALIGN 16
-x264_hpel_filter_mmxext :
+cglobal x264_hpel_filter_mmxext
push ebp
mov ebp, esp
push ebx
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
; uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
-ALIGN 16
-x264_plane_copy_mmxext:
+cglobal x264_plane_copy_mmxext
push edi
push esi
push ebx
SECTION .text
-cglobal x264_pixel_sad_16x16_mmxext
-cglobal x264_pixel_sad_16x8_mmxext
-cglobal x264_pixel_sad_8x16_mmxext
-cglobal x264_pixel_sad_8x8_mmxext
-cglobal x264_pixel_sad_8x4_mmxext
-cglobal x264_pixel_sad_4x8_mmxext
-cglobal x264_pixel_sad_4x4_mmxext
-
-cglobal x264_pixel_sad_x3_16x16_mmxext
-cglobal x264_pixel_sad_x3_16x8_mmxext
-cglobal x264_pixel_sad_x3_8x16_mmxext
-cglobal x264_pixel_sad_x3_8x8_mmxext
-cglobal x264_pixel_sad_x3_8x4_mmxext
-cglobal x264_pixel_sad_x3_4x8_mmxext
-cglobal x264_pixel_sad_x3_4x4_mmxext
-
-cglobal x264_pixel_sad_x4_16x16_mmxext
-cglobal x264_pixel_sad_x4_16x8_mmxext
-cglobal x264_pixel_sad_x4_8x16_mmxext
-cglobal x264_pixel_sad_x4_8x8_mmxext
-cglobal x264_pixel_sad_x4_8x4_mmxext
-cglobal x264_pixel_sad_x4_4x8_mmxext
-cglobal x264_pixel_sad_x4_4x4_mmxext
-
-cglobal x264_pixel_sad_pde_16x16_mmxext
-cglobal x264_pixel_sad_pde_16x8_mmxext
-cglobal x264_pixel_sad_pde_8x16_mmxext
-
-cglobal x264_pixel_ssd_16x16_mmx
-cglobal x264_pixel_ssd_16x8_mmx
-cglobal x264_pixel_ssd_8x16_mmx
-cglobal x264_pixel_ssd_8x8_mmx
-cglobal x264_pixel_ssd_8x4_mmx
-cglobal x264_pixel_ssd_4x8_mmx
-cglobal x264_pixel_ssd_4x4_mmx
-
-cglobal x264_pixel_satd_4x4_mmxext
-cglobal x264_pixel_satd_4x8_mmxext
-cglobal x264_pixel_satd_8x4_mmxext
-cglobal x264_pixel_satd_8x8_mmxext
-cglobal x264_pixel_satd_16x8_mmxext
-cglobal x264_pixel_satd_8x16_mmxext
-cglobal x264_pixel_satd_16x16_mmxext
-
-cglobal x264_pixel_sa8d_16x16_mmxext
-cglobal x264_pixel_sa8d_8x8_mmxext
-
-cglobal x264_intra_satd_x3_4x4_mmxext
-cglobal x264_intra_satd_x3_8x8c_mmxext
-cglobal x264_intra_satd_x3_16x16_mmxext
-cglobal x264_intra_sa8d_x3_8x8_core_mmxext
-
-cglobal x264_pixel_ssim_4x4x2_core_mmxext
-
-cglobal x264_pixel_ads4_mmxext
-cglobal x264_pixel_ads2_mmxext
-cglobal x264_pixel_ads1_mmxext
-
%macro SAD_START 0
push ebx
; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SAD 2
-ALIGN 16
-x264_pixel_sad_%1x%2_mmxext:
+cglobal x264_pixel_sad_%1x%2_mmxext
SAD_START
%rep %2/2
SAD_INC_2x%1P
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
-ALIGN 16
-x264_pixel_sad_x%1_%2x%3_mmxext:
+cglobal x264_pixel_sad_x%1_%2x%3_mmxext
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
; int __cdecl x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
%macro SAD_PDE 2
-ALIGN 16
-x264_pixel_sad_pde_%1x%2_mmxext:
+cglobal x264_pixel_sad_pde_%1x%2_mmxext
SAD_START
%rep %2/4
SAD_INC_2x%1P
; int __cdecl x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 2
-ALIGN 16
-x264_pixel_ssd_%1x%2_mmx:
+cglobal x264_pixel_ssd_%1x%2_mmx
SSD_START
%rep %2
SSD_INC_1x%1P
ret
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_4x4_mmxext:
+cglobal x264_pixel_satd_4x4_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_4x8_mmxext:
+cglobal x264_pixel_satd_4x8_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
paddw mm0, mm1
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_mmxext:
+cglobal x264_pixel_satd_8x4_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
sub eax, ebx
paddw mm0, mm1
SATD_END
-ALIGN 16
; NOTE(review): this chunk is a patch listing, not plain source — lines
; prefixed '-' are removed, '+' are added, unprefixed lines are unchanged
; diff context.  The change replaces each "ALIGN 16" + bare-label function
; prologue with the `cglobal` macro (defined earlier in the file), which
; appears to emit the global declaration, alignment, and Win64 name padding
; in one place — TODO confirm against the macro definition in the header.
; Function bodies below are elided diff context (SATD_START, SATD_END,
; LOAD_DIFF_HADAMARD_SUM etc. are macros defined elsewhere in the file);
; all code lines are intentionally left byte-identical.
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_mmxext:
+cglobal x264_pixel_satd_8x8_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
paddw mm0, mm1
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_mmxext:
+cglobal x264_pixel_satd_16x8_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
paddw mm0, mm1
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_mmxext:
+cglobal x264_pixel_satd_8x16_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
paddw mm0, mm1
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_mmxext:
+cglobal x264_pixel_satd_16x16_mmxext
SATD_START
LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
paddw mm0, mm1
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_sa8d_8x8_mmxext:
+cglobal x264_pixel_sa8d_8x8_mmxext
SATD_START
sub esp, 0x70
%define args esp+0x74
%undef spill
%undef trans
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
-x264_pixel_sa8d_16x16_mmxext:
+cglobal x264_pixel_sa8d_16x16_mmxext
push esi
push edi
push ebp
%8 %3, %6
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-x264_intra_satd_x3_4x4_mmxext:
+cglobal x264_intra_satd_x3_4x4_mmxext
push ebx
push edi
push esi
pop ebx
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-x264_intra_satd_x3_16x16_mmxext:
+cglobal x264_intra_satd_x3_16x16_mmxext
push ebx
push ebp
push edi
pop ebx
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-x264_intra_satd_x3_8x8c_mmxext:
+cglobal x264_intra_satd_x3_8x8c_mmxext
push ebx
push ebp
push edi
movq mm7, [spill]
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-x264_intra_sa8d_x3_8x8_core_mmxext:
+cglobal x264_intra_sa8d_x3_8x8_core_mmxext
mov eax, [esp+4]
mov ecx, [esp+8]
sub esp, 0x70
; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_4x4x2_core_mmxext:
+cglobal x264_pixel_ssim_4x4x2_core_mmxext
push ebx
push edi
mov ebx, [esp+16]
; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *res, int width )
;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ads4_mmxext:
+cglobal x264_pixel_ads4_mmxext
push ebx
mov eax, [esp+8]
movq mm6, [eax]
pop ebx
ret
-ALIGN 16
-x264_pixel_ads2_mmxext:
+cglobal x264_pixel_ads2_mmxext
push ebx
mov eax, [esp+8]
movq mm6, [eax]
pop ebx
ret
-ALIGN 16
-x264_pixel_ads1_mmxext:
+cglobal x264_pixel_ads1_mmxext
mov eax, [esp+4]
pshufw mm7, [eax], 0
mov eax, [esp+8]
; NOTE(review): patch listing ('-' = removed, '+' = added, bare = context).
; In this SSE2 pixel section the patch deletes the block of forward
; `cglobal` declarations after SECTION .text and instead tags each entry
; point with `cglobal` at its definition, replacing the old "ALIGN 16" +
; bare-label prologues.  Bodies are elided diff context (SAD_*/SSD_*/SATD_*
; macros are defined elsewhere in the file); code lines left byte-identical.
SECTION .text
-
-cglobal x264_pixel_sad_16x16_sse2
-cglobal x264_pixel_sad_16x8_sse2
-cglobal x264_pixel_sad_x3_16x16_sse2
-cglobal x264_pixel_sad_x3_16x8_sse2
-cglobal x264_pixel_sad_x4_16x16_sse2
-cglobal x264_pixel_sad_x4_16x8_sse2
-cglobal x264_pixel_ssd_16x16_sse2
-cglobal x264_pixel_ssd_16x8_sse2
-cglobal x264_pixel_satd_8x4_sse2
-cglobal x264_pixel_satd_8x8_sse2
-cglobal x264_pixel_satd_16x8_sse2
-cglobal x264_pixel_satd_8x16_sse2
-cglobal x264_pixel_satd_16x16_sse2
-cglobal x264_pixel_ssim_4x4x2_core_sse2
-cglobal x264_pixel_ssim_end4_sse2
-
-
%macro HADDW 2 ; sum junk
; ebx is no longer used at this point, so no push needed
picgetgot ebx
ret
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_sad_16x16_sse2:
+cglobal x264_pixel_sad_16x16_sse2
SAD_START_SSE2
movdqu xmm0, [ecx]
movdqu xmm1, [ecx+edx]
paddw xmm0, xmm7
SAD_END_SSE2
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_sad_16x8_sse2:
+cglobal x264_pixel_sad_16x8_sse2
SAD_START_SSE2
pxor xmm0, xmm0
SAD_INC_4x16P_SSE2
ret
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
-ALIGN 16
-x264_pixel_sad_x%1_%2x%3_sse2:
+cglobal x264_pixel_sad_x%1_%2x%3_sse2
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
ret
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_ssd_16x16_sse2:
+cglobal x264_pixel_ssd_16x16_sse2
SSD_START_SSE2
%rep 8
SSD_INC_2x16P_SSE2
%endrep
SSD_END_SSE2
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_ssd_16x8_sse2:
+cglobal x264_pixel_ssd_16x8_sse2
SSD_START_SSE2
%rep 4
SSD_INC_2x16P_SSE2
ret
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_sse2:
+cglobal x264_pixel_satd_16x16_sse2
SATD_START
SATD_TWO_SSE2
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_sse2:
+cglobal x264_pixel_satd_8x16_sse2
SATD_START
SATD_TWO_SSE2
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_sse2:
+cglobal x264_pixel_satd_16x8_sse2
SATD_START
SATD_TWO_SSE2
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_sse2:
+cglobal x264_pixel_satd_8x8_sse2
SATD_START
SATD_TWO_SSE2
SATD_END
-ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_sse2:
+cglobal x264_pixel_satd_8x4_sse2
SATD_START
SATD_TWO_SSE2
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_4x4x2_core_sse2:
+cglobal x264_pixel_ssim_4x4x2_core_sse2
push ebx
mov eax, [esp+ 8]
mov ebx, [esp+12]
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_end4_sse2:
+cglobal x264_pixel_ssim_end4_sse2
mov eax, [esp+ 4]
mov ecx, [esp+ 8]
mov edx, [esp+12]
; NOTE(review): patch listing ('-' = removed, '+' = added, bare = context).
; The intra-prediction section drops its forward `cglobal` declaration block
; and converts each "ALIGN 16" + bare-label (including labels with a stray
; space before the colon, e.g. "predict_8x8c_v_mmx :") into a `cglobal`
; line at the definition.  Inside the PRED8x8_DC macro the generated label
; "%1:" likewise becomes "cglobal %1".  Bodies are elided diff context;
; code lines left byte-identical.  One comment typo fixed below: the banner
; before PRED8x8_DC read "predict_8x8_top_mmxext" but the macro's callers
; presumably instantiate dc_top/dc_left variants — verify against the
; PRED8x8_DC invocations elsewhere in the file.
SECTION .text
-cglobal predict_8x8_v_mmxext
-cglobal predict_8x8_dc_mmxext
-cglobal predict_8x8_dc_top_mmxext
-cglobal predict_8x8_dc_left_mmxext
-cglobal predict_8x8_ddl_mmxext
-cglobal predict_8x8_ddr_mmxext
-cglobal predict_8x8_vr_core_mmxext
-cglobal predict_8x8c_v_mmx
-cglobal predict_8x8c_dc_core_mmxext
-cglobal predict_8x8c_p_core_mmxext
-cglobal predict_16x16_p_core_mmxext
-cglobal predict_16x16_v_mmx
-cglobal predict_16x16_dc_core_mmxext
-cglobal predict_16x16_dc_top_mmxext
-
-
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS 5
;-----------------------------------------------------------------------------
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_v_mmxext:
+cglobal predict_8x8_v_mmxext
mov eax, [esp+8]
mov edx, [esp+4]
movq mm0, [eax+16]
;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_dc_mmxext:
+cglobal predict_8x8_dc_mmxext
picpush ebx
picgetgot ebx
mov eax, [picesp + 8]
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PRED8x8_DC 2
-ALIGN 16
-%1:
+cglobal %1
picpush ebx
picgetgot ebx
mov eax, [picesp + 8]
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_ddl_mmxext:
+cglobal predict_8x8_ddl_mmxext
picpush ebx
picgetgot ebx
mov eax, [picesp + 8]
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_ddr_mmxext:
+cglobal predict_8x8_ddr_mmxext
picpush ebx
picgetgot ebx
mov eax, [picesp + 8]
; 6 .....
; 7 ,,,,,
-ALIGN 16
-predict_8x8_vr_core_mmxext:
+cglobal predict_8x8_vr_core_mmxext
picpush ebx
picgetgot ebx
mov eax, [picesp + 8]
;-----------------------------------------------------------------------------
; void predict_8x8c_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_v_mmx :
+cglobal predict_8x8c_v_mmx
mov edx, [esp + 4]
movq mm0, [edx - FDEC_STRIDE]
STORE8x8 mm0, mm0
;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_dc_core_mmxext:
+cglobal predict_8x8c_dc_core_mmxext
picpush ebx
picgetgot ebx
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_p_core_mmxext:
+cglobal predict_8x8c_p_core_mmxext
picpush ebx
picgetgot ebx
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_p_core_mmxext:
-
+cglobal predict_16x16_p_core_mmxext
picpush ebx
picgetgot ebx
;-----------------------------------------------------------------------------
; void predict_16x16_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_v_mmx :
-
+cglobal predict_16x16_v_mmx
mov edx, [esp + 4]
mov ecx, FDEC_STRIDE
sub edx, ecx ; edx <-- line -1
pop edi
%endmacro
-ALIGN 16
-predict_16x16_dc_core_mmxext:
+cglobal predict_16x16_dc_core_mmxext
PRED16x16_DC [esp+8], 5, esp
ret
-ALIGN 16
-predict_16x16_dc_top_mmxext:
+cglobal predict_16x16_dc_top_mmxext
picpush ebx
picgetgot ebx
PRED16x16_DC [pw_8 GOT_ebx], 4, picesp
; NOTE(review): patch listing ('-' = removed, '+' = added, bare = context).
; The quant/dequant section drops its forward `cglobal` declaration block
; and converts each "ALIGN 16" + bare-label prologue into a `cglobal` line
; at the definition.  At the end, the dequant banner comment is moved out
; of the DEQUANT_WxH macro body (old ";;;" form) to a proper banner above
; the macro, and the macro's generated label "%1:" becomes "cglobal %1".
; Bodies are elided diff context (MMX_QUANT*_START / MMXEXT_QUANT* macros
; defined elsewhere in the file); code lines left byte-identical.
SECTION .text
-cglobal x264_quant_2x2_dc_core15_mmx
-cglobal x264_quant_4x4_dc_core15_mmx
-cglobal x264_quant_4x4_core15_mmx
-cglobal x264_quant_8x8_core15_mmx
-
-cglobal x264_quant_2x2_dc_core16_mmxext
-cglobal x264_quant_4x4_dc_core16_mmxext
-cglobal x264_quant_4x4_core16_mmxext
-cglobal x264_quant_8x8_core16_mmxext
-
-cglobal x264_quant_2x2_dc_core32_mmxext
-cglobal x264_quant_4x4_dc_core32_mmxext
-cglobal x264_quant_4x4_core32_mmxext
-cglobal x264_quant_8x8_core32_mmxext
-
-cglobal x264_dequant_4x4_mmx
-cglobal x264_dequant_8x8_mmx
-
%macro MMX_QUANT_AC_START 0
mov eax, [esp+ 4] ; &dct[0][0]
mov ecx, [esp+ 8] ; &quant_mf[0][0]
movq %1, mm0 ; store
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core15_mmx:
+cglobal x264_quant_2x2_dc_core15_mmx
MMX_QUANT15_DC_START
MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core15_mmx:
+cglobal x264_quant_4x4_dc_core15_mmx
MMX_QUANT15_DC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_core15_mmx:
+cglobal x264_quant_4x4_core15_mmx
MMX_QUANT_AC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_8x8_core15_mmx:
+cglobal x264_quant_8x8_core15_mmx
MMX_QUANT_AC_START
%rep 16
movq %1, mm0 ; store
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core16_mmxext:
+cglobal x264_quant_2x2_dc_core16_mmxext
MMXEXT_QUANT16_DC_START
MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core16_mmxext:
+cglobal x264_quant_4x4_dc_core16_mmxext
MMXEXT_QUANT16_DC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_core16_mmxext:
+cglobal x264_quant_4x4_core16_mmxext
MMX_QUANT_AC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_8x8_core16_mmxext:
+cglobal x264_quant_8x8_core16_mmxext
MMX_QUANT_AC_START
%rep 16
movq %1, mm0 ; store
%endmacro
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core32_mmxext:
+cglobal x264_quant_2x2_dc_core32_mmxext
MMX_QUANT32_DC_START
MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core32_mmxext:
+cglobal x264_quant_4x4_dc_core32_mmxext
MMX_QUANT32_DC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_4x4_core32_mmxext:
+cglobal x264_quant_4x4_core32_mmxext
MMX_QUANT_AC_START
%rep 4
ret
-ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
-x264_quant_8x8_core32_mmxext:
+cglobal x264_quant_8x8_core32_mmxext
MMX_QUANT_AC_START
%rep 16
movq %1, mm0
%endmacro
+;-----------------------------------------------------------------------------
+; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+;-----------------------------------------------------------------------------
%macro DEQUANT_WxH 3
-ALIGN 16
-;;; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-%1:
+cglobal %1
mov edx, [esp+12] ; i_qp
imul eax, edx, 0x2b
shr eax, 8 ; i_qbits = i_qp / 6