From e269ca55e5244280afd0347c1088083cf7043d48 Mon Sep 17 00:00:00 2001 From: Ilia Date: Mon, 28 Nov 2011 05:20:09 -0800 Subject: [PATCH] More 4:2:2 asm functions High bit depth version of deblock_h_chroma_422. Regular and high bit depth versions of deblock_h_chroma_intra_422. High bit depth pixel_vsad. SSE2 high bit depth and MMX 8-bit predict_8x8_vl. Our first GCI patch this year! --- common/dct.c | 3 ++ common/deblock.c | 6 ++- common/pixel.c | 8 +++- common/x86/dct-a.asm | 74 +++++++++++++++++++++++++++++++ common/x86/dct.h | 5 ++- common/x86/deblock-a.asm | 96 +++++++++++++++++++++++++++++++++------- common/x86/pixel.h | 2 + common/x86/predict-a.asm | 45 +++++++++++++++++++ common/x86/predict-c.c | 4 ++ common/x86/predict.h | 6 ++- common/x86/sad16-a.asm | 51 +++++++++++++++++++++ tools/checkasm.c | 24 ++++++---- 12 files changed, 293 insertions(+), 31 deletions(-) diff --git a/common/dct.c b/common/dct.c index 05cd506b..6836bc2b 100644 --- a/common/dct.c +++ b/common/dct.c @@ -511,7 +511,9 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->idct4x4dc = x264_idct4x4dc_sse2; dctf->add8x8_idct = x264_add8x8_idct_sse2; dctf->add16x16_idct = x264_add16x16_idct_sse2; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2; dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2; } if( cpu&X264_CPU_AVX ) @@ -522,6 +524,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add8x8_idct = x264_add8x8_idct_avx; dctf->add16x16_idct = x264_add16x16_idct_avx; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx; dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx; } #endif // HAVE_MMX diff --git a/common/deblock.c b/common/deblock.c index b90ca897..619bf16d 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -658,6 +658,8 @@ void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int be void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta ); void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); @@ -741,6 +743,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2; #if !HIGH_BIT_DEPTH pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2; #endif pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2; @@ -752,9 +755,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) { pf->deblock_strength = x264_deblock_strength_sse2; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2; -#if !HIGH_BIT_DEPTH pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2; -#endif + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_luma[1] = x264_deblock_v_luma_sse2; diff --git a/common/pixel.c b/common/pixel.c index ee0148ff..872a75e2 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -884,7 +884,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) { INIT4( hadamard_ac, _sse2 ); } - + pixf->vsad = x264_pixel_vsad_sse2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; @@ -911,7 +911,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) { INIT4( hadamard_ac, _ssse3 ); } - + pixf->vsad = x264_pixel_vsad_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; @@ -943,6 +943,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; } + if( cpu&X264_CPU_XOP ) + { + pixf->vsad = x264_pixel_vsad_xop; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 7eecc666..6cb3212c 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -31,6 +31,7 @@ %include "x86util.asm" SECTION_RODATA +pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1 @@ -852,6 +853,79 @@ SUB8x16_DCT_DC %endif ; !HIGH_BIT_DEPTH +%macro DCTDC_4ROW_SSE2 2 + mova %1, [r1+FENC_STRIDEB*%2] + mova m0, [r2+FDEC_STRIDEB*%2] +%assign Y (%2+1) +%rep 3 + paddw %1, [r1+FENC_STRIDEB*Y] + paddw m0, [r2+FDEC_STRIDEB*Y] +%assign Y (Y+1) +%endrep + psubw %1, m0 + pshufd m0, %1, q2301 + paddw %1, m0 +%endmacro + +%ifdef HIGH_BIT_DEPTH +%macro SUB8x8_DCT_DC_10 0 +cglobal sub8x8_dct_dc, 3,3,3 + DCTDC_4ROW_SSE2 m1, 0 + DCTDC_4ROW_SSE2 m2, 4 + mova m0, [pw_ppmmmmpp] + pmaddwd m1, m0 + pmaddwd m2, m0 + pshufd m0, m1, q2200 ; -1 -1 +0 +0 + pshufd m1, m1, q0033 ; +0 +0 +1 +1 + paddd m1, m0 + pshufd m0, m2, q1023 ; -2 +2 -3 +3 + paddd m1, m2 + paddd m1, m0 + mova [r0], m1 + RET +%endmacro +INIT_XMM sse2 +SUB8x8_DCT_DC_10 + +%macro SUB8x16_DCT_DC_10 0 +cglobal sub8x16_dct_dc, 3,3,6 + DCTDC_4ROW_SSE2 m1, 0 + DCTDC_4ROW_SSE2 m2, 4 + DCTDC_4ROW_SSE2 m3, 8 + DCTDC_4ROW_SSE2 m4, 12 + mova m0, [pw_ppmmmmpp] + pmaddwd m1, m0 + pmaddwd m2, m0 + pshufd m5, m1, q2200 ; -1 -1 +0 +0 + pshufd m1, m1, q0033 ; +0 +0 +1 +1 + paddd m1, m5 + pshufd m5, m2, q1023 ; -2 +2 -3 +3 + paddd m1, m2 + paddd m1, m5 ; a6 a2 a4 a0 + pmaddwd m3, m0 + pmaddwd m4, m0 + pshufd m5, m3, q2200 + pshufd m3, m3, q0033 + paddd m3, m5 + pshufd m5, m4, q1023 + paddd m3, m4 + paddd m3, m5 ; a7 a3 a5 a1 + paddd m0, m1, m3 + psubd m1, m3 + pshufd m0, m0, q3120 + pshufd m1, m1, q3120 + punpcklqdq m2, m0, m1 + punpckhqdq m1, m0 + mova [r0+ 0], m2 + mova [r0+16], m1 + RET +%endmacro +INIT_XMM sse2 +SUB8x16_DCT_DC_10 +INIT_XMM avx +SUB8x16_DCT_DC_10 +%endif + ;----------------------------------------------------------------------------- ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- diff --git a/common/x86/dct.h b/common/x86/dct.h index 4fb39e3d..beb67624 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -39,8 +39,9 @@ void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct_dc_sse2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x16_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] ); diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 0459ee80..2313a0f0 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -1780,6 +1780,52 @@ INIT_XMM sse2 DEBLOCK_CHROMA INIT_XMM avx DEBLOCK_CHROMA + +%macro DEBLOCK_H_CHROMA_422_INTRA_10 0 +cglobal deblock_h_chroma_422_intra, 4,6,8 + add r1, r1 + mov r4, 64/mmsize +%if mmsize == 16 + lea r5, [r1*3] +%endif +.loop: + CHROMA_H_LOAD r5 + call deblock_intra_body + CHROMA_H_STORE r5 + lea r0, [r0+r1*(mmsize/4)] + dec r4 + jg .loop + REP_RET +%endmacro +INIT_XMM sse2 +DEBLOCK_H_CHROMA_422_INTRA_10 + +%macro DEBLOCK_H_CHROMA_422_10 0 +cglobal deblock_h_chroma_422, 5,7,8 + add r1, r1 + mov r5, 64/mmsize + lea r6, [r1*3] +.loop: + CHROMA_H_LOAD r6 + RESET_MM_PERMUTATION + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + pxor m4, m4 + movd m6, [r4-1] + psraw m6, 8 + SPLATW m6, m6 + pmaxsw m6, m4 + pand m7, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + CHROMA_H_STORE r6 + lea r0, [r0+r1*(mmsize/4)] + add r4, mmsize/16 + dec r5 + jg .loop + REP_RET +%endmacro +INIT_XMM sse2 +DEBLOCK_H_CHROMA_422_10 %endif ; HIGH_BIT_DEPTH %ifndef HIGH_BIT_DEPTH @@ -1791,7 +1837,7 @@ DEBLOCK_CHROMA sub t5, r1 %if mmsize==8 mov dword r0m, 2 -.skip_prologue: +.loop: %endif %endmacro @@ -1802,10 +1848,6 @@ DEBLOCK_CHROMA lea t6, [r1*3] mov t5, r0 add r0, t6 -%if mmsize==8 - mov dword r0m, 2 -.skip_prologue: -%endif %endmacro %macro CHROMA_V_LOOP 1 @@ -1816,7 +1858,7 @@ DEBLOCK_CHROMA add r4, 2 %endif dec dword r0m - jg .skip_prologue + jg .loop %endif %endmacro @@ -1828,7 +1870,7 @@ DEBLOCK_CHROMA add r4, 2 %endif dec dword r0m - jg .skip_prologue + jg .loop %endif %endmacro @@ -1865,6 +1907,10 @@ cglobal deblock_v_chroma, 5,6,8 ;----------------------------------------------------------------------------- cglobal deblock_h_chroma, 5,7,8 CHROMA_H_START +%if mmsize==8 + mov dword r0m, 2 +.loop: +%endif TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_inter_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) @@ -1888,14 +1934,9 @@ cglobal deblock_h_chroma_422, 5,7,8 %else %define cntr dword r0m %endif - dec r2d - dec r3d - sub r0, 4 - lea t6, [r1*3] - mov t5, r0 - add r0, t6 + CHROMA_H_START mov cntr, 32/mmsize -.skip_prologue: +.loop: TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 @@ -1913,7 +1954,7 @@ cglobal deblock_h_chroma_422, 5,7,8 lea t5, [t5+r1*(mmsize/2)] add r4, mmsize/8 dec cntr - jg .skip_prologue + jg .loop REP_RET %endmacro @@ -1972,6 +2013,10 @@ cglobal deblock_v_chroma_intra, 4,5,8 ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra, 4,6,8 CHROMA_H_START +%if mmsize==8 + mov dword r0m, 2 +.loop: +%endif TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) @@ -1987,6 +2032,27 @@ DEBLOCK_CHROMA_INTRA INIT_MMX mmx2 DEBLOCK_CHROMA_INTRA %endif + +%macro DEBLOCK_H_CHROMA_422_INTRA 0 +cglobal deblock_h_chroma_422_intra, 4,7,8 + CHROMA_H_START + mov r6d, 32/mmsize +.loop: + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_intra_body + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + lea r0, [r0+r1*(mmsize/2)] + lea t5, [t5+r1*(mmsize/2)] + dec r6d + jg .loop + REP_RET +%endmacro +INIT_XMM sse2 +DEBLOCK_H_CHROMA_422_INTRA +%ifndef ARCH_X86_64 +INIT_MMX mmx2 +DEBLOCK_H_CHROMA_422_INTRA +%endif %endif ; !HIGH_BIT_DEPTH diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 5a88b0f2..a544b8e0 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -153,6 +153,8 @@ int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * ); int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * ); int x264_pixel_vsad_mmx2( pixel *src, int stride, int height ); int x264_pixel_vsad_sse2( pixel *src, int stride, int height ); +int x264_pixel_vsad_ssse3( pixel *src, int stride, int height ); +int x264_pixel_vsad_xop( pixel *src, int stride, int height ); #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 39017f66..59019ca5 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -1406,6 +1406,51 @@ PREDICT_8x8 %endif ; !HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; void predict_8x8_vl( pixel *src, pixel *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_VL_10 1 +cglobal predict_8x8_vl, 2,2,8 + mova m0, [r1+16*SIZEOF_PIXEL] + mova m1, [r1+24*SIZEOF_PIXEL] + PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4 + PSRLPIX m4, m1, 1 + pavg%1 m6, m0, m2 + pavg%1 m7, m1, m4 + add r0, FDEC_STRIDEB*4 + mova [r0-4*FDEC_STRIDEB], m6 + PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5 + mova [r0-2*FDEC_STRIDEB], m3 + PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5 + mova [r0+0*FDEC_STRIDEB], m3 + PALIGNR m3, m7, m6, SIZEOF_PIXEL*3, m5 + mova [r0+2*FDEC_STRIDEB], m3 + PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6 + PSLLPIX m5, m0, 1 + PRED8x8_LOWPASS m0, m5, m2, m0, m7 + PRED8x8_LOWPASS m1, m3, m4, m1, m7 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2 + mova [r0-3*FDEC_STRIDEB], m4 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2 + mova [r0-1*FDEC_STRIDEB], m4 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2 + mova [r0+1*FDEC_STRIDEB], m4 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*4, m2 + mova [r0+3*FDEC_STRIDEB], m4 + RET +%endmacro +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_VL_10 w +INIT_XMM ssse3 +PREDICT_8x8_VL_10 w +INIT_XMM avx +PREDICT_8x8_VL_10 w +%else +INIT_MMX mmx2 +PREDICT_8x8_VL_10 b +%endif + ;----------------------------------------------------------------------------- ; void predict_8x8_hd( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index 86abc4c6..f41bb892 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -419,6 +419,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2; @@ -429,6 +430,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_ssse3; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) @@ -440,6 +442,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ return; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx; *predict_8x8_filter = x264_predict_8x8_filter_avx; #else @@ -449,6 +452,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2; pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmx2; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_mmx2; *predict_8x8_filter = x264_predict_8x8_filter_mmx2; #if ARCH_X86 pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmx2; diff --git a/common/x86/predict.h b/common/x86/predict.h index 63e08de1..708830ce 100644 --- a/common/x86/predict.h +++ b/common/x86/predict.h @@ -93,8 +93,10 @@ void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] ); -void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] ); +void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] ); +void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] ); +void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] ); void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] ); diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm index a0b85205..50389651 100644 --- a/common/x86/sad16-a.asm +++ b/common/x86/sad16-a.asm @@ -347,6 +347,57 @@ SAD_XMM 8, 8 %endrep %endmacro +%macro PIXEL_VSAD 0 +cglobal pixel_vsad, 3,3,8 + mova m0, [r0] + mova m1, [r0+16] + mova m2, [r0+2*r1] + mova m3, [r0+2*r1+16] + lea r0, [r0+4*r1] + psubw m0, m2 + psubw m1, m3 + ABSW2 m0, m1, m0, m1, m4, m5 + paddw m0, m1 + sub r2d, 2 + je .end +.loop: + mova m4, [r0] + mova m5, [r0+16] + mova m6, [r0+2*r1] + mova m7, [r0+2*r1+16] + lea r0, [r0+4*r1] + psubw m2, m4 + psubw m3, m5 + psubw m4, m6 + psubw m5, m7 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + ABSW m4, m4, m1 + ABSW m5, m5, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + paddw m0, m5 + mova m2, m6 + mova m3, m7 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682 +%else + HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426 +%endif + movd eax, m0 + RET +%endmacro +INIT_XMM sse2 +PIXEL_VSAD +INIT_XMM ssse3 +PIXEL_VSAD +INIT_XMM xop +PIXEL_VSAD + ;----------------------------------------------------------------------------- ; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, ; uint16_t *pix2, int i_stride, int scores[3] ) diff --git a/tools/checkasm.c b/tools/checkasm.c index 1f491cfa..e3aca03e 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -426,6 +426,10 @@ static int check_pixel( int cpu_ref, int cpu_new ) } report( "pixel hadamard_ac :" ); + // maximize sum + for( int i = 0; i < 32; i++ ) + for( int j = 0; j < 16; j++ ) + pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX; ok = 1; used_asm = 0; if( pixel_asm.vsad != pixel_ref.vsad ) { @@ -434,13 +438,17 @@ static int check_pixel( int cpu_ref, int cpu_new ) int res_c, res_asm; set_func_name( "vsad" ); used_asm = 1; - res_c = call_c( pixel_c.vsad, pbuf1, 16, h ); - res_asm = call_a( pixel_asm.vsad, pbuf1, 16, h ); - if( res_c != res_asm ) + for( int j = 0; j < 2 && ok; j++ ) { - ok = 0; - fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm ); - break; + pixel *p = j ? pbuf4 : pbuf1; + res_c = call_c( pixel_c.vsad, p, 16, h ); + res_asm = call_a( pixel_asm.vsad, p, 16, h ); + if( res_c != res_asm ) + { + ok = 0; + fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm ); + break; + } } } } @@ -721,8 +729,8 @@ static int check_dct( int cpu_ref, int cpu_new ) { int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1)); int cond_b = (i == 0) ? 1 : !cond_a; - enc[0] = enc[1] = cond_a ? PIXEL_MAX : 0; - enc[2] = enc[3] = cond_b ? PIXEL_MAX : 0; + enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0; + enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? PIXEL_MAX : 0; for( int k = 0; k < 4; k++ ) dec[k] = PIXEL_MAX - enc[k]; -- 2.40.0