From 5f5fa1e9dc6a7dd51fa6c2da243e27fae845887d Mon Sep 17 00:00:00 2001 From: Holger Lubitz Date: Wed, 4 Feb 2009 12:46:17 -0800 Subject: [PATCH] Merging Holger's GSOC branch part 2: intra prediction Assembly versions of most remaining 4x4 and 8x8 intra pred functions. Assembly version of predict_8x8_filter. A few other optimizations. Primarily Core 2-optimized. --- common/common.h | 1 + common/predict.c | 12 +- common/predict.h | 3 +- common/x86/predict-a.asm | 461 ++++++++++++++++++++++++++++++++++++--- common/x86/predict-c.c | 62 ++++-- common/x86/predict.h | 3 +- encoder/analyse.c | 2 +- encoder/encoder.c | 2 +- tools/checkasm.c | 32 ++- 9 files changed, 520 insertions(+), 58 deletions(-) diff --git a/common/common.h b/common/common.h index 97c68781..394f9dab 100644 --- a/common/common.h +++ b/common/common.h @@ -616,6 +616,7 @@ struct x264_t x264_predict_t predict_8x8c[4+3]; x264_predict8x8_t predict_8x8[9+3]; x264_predict_t predict_4x4[9+3]; + x264_predict_8x8_filter_t predict_8x8_filter; x264_pixel_function_t pixf; x264_mc_functions_t mc; diff --git a/common/predict.c b/common/predict.c index 9b6c600b..3c6cb108 100644 --- a/common/predict.c +++ b/common/predict.c @@ -506,7 +506,7 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in int have_lt = i_neighbor & MB_TOPLEFT; if( i_filters & MB_LEFT ) { - edge[15] = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2; + edge[15] = (SRC(0,-1) + 2*SRC(-1,-1) + SRC(-1,0) + 2) >> 2; edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0)) + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) @@ -519,8 +519,8 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in edge[16] = ((have_lt ? SRC(-1,-1) : SRC(0,-1)) + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) - edge[23] = ((have_tr ? SRC(8,-1) : SRC(7,-1)) - + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; + edge[23] = (SRC(6,-1) + 2*SRC(7,-1) + + (have_tr ? 
SRC(8,-1) : SRC(7,-1)) + 2) >> 2; if( i_filters & MB_TOPRIGHT ) { @@ -563,7 +563,6 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in src += FDEC_STRIDE; \ } -/* SIMD is much faster than C for all of these except HU and HD. */ static void predict_8x8_dc_128( uint8_t *src, uint8_t edge[33] ) { PREDICT_8x8_DC(0x80808080); @@ -795,7 +794,7 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) #endif } -void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] ) +void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ) { pf[I_PRED_8x8_V] = predict_8x8_v; pf[I_PRED_8x8_H] = predict_8x8_h; @@ -809,9 +808,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] ) pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left; pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top; pf[I_PRED_8x8_DC_128] = predict_8x8_dc_128; + *predict_8x8_filter = x264_predict_8x8_filter; #ifdef HAVE_MMX - x264_predict_8x8_init_mmx( cpu, pf ); + x264_predict_8x8_init_mmx( cpu, pf, predict_8x8_filter ); #endif } diff --git a/common/predict.h b/common/predict.h index 65912190..630cadd1 100644 --- a/common/predict.h +++ b/common/predict.h @@ -26,6 +26,7 @@ typedef void (*x264_predict_t)( uint8_t *src ); typedef void (*x264_predict8x8_t)( uint8_t *src, uint8_t edge[33] ); +typedef void (*x264_predict_8x8_filter_t) ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters ); enum intra_chroma_pred_e { @@ -111,7 +112,7 @@ void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, in void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] ); void x264_predict_8x8c_init ( int cpu, x264_predict_t pf[7] ); void x264_predict_4x4_init ( int cpu, x264_predict_t pf[12] ); -void x264_predict_8x8_init ( int cpu, x264_predict8x8_t pf[12] ); +void x264_predict_8x8_init ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ); #endif diff --git a/common/x86/predict-a.asm 
b/common/x86/predict-a.asm index 0f64ca21..5596189e 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -25,14 +25,15 @@ %include "x86util.asm" %macro STORE8x8 2 - movq [r0 + 0*FDEC_STRIDE], %1 - movq [r0 + 1*FDEC_STRIDE], %1 - movq [r0 + 2*FDEC_STRIDE], %1 - movq [r0 + 3*FDEC_STRIDE], %1 - movq [r0 + 4*FDEC_STRIDE], %2 - movq [r0 + 5*FDEC_STRIDE], %2 - movq [r0 + 6*FDEC_STRIDE], %2 - movq [r0 + 7*FDEC_STRIDE], %2 + add r0, 4*FDEC_STRIDE + movq [r0 + -4*FDEC_STRIDE], %1 + movq [r0 + -3*FDEC_STRIDE], %1 + movq [r0 + -2*FDEC_STRIDE], %1 + movq [r0 + -1*FDEC_STRIDE], %1 + movq [r0 + 0*FDEC_STRIDE], %2 + movq [r0 + 1*FDEC_STRIDE], %2 + movq [r0 + 2*FDEC_STRIDE], %2 + movq [r0 + 3*FDEC_STRIDE], %2 %endmacro %macro STORE16x16 2 @@ -52,15 +53,24 @@ %endmacro %macro STORE16x16_SSE2 1 - mov r1d, 4 -.loop: - movdqa [r0 + 0*FDEC_STRIDE], %1 - movdqa [r0 + 1*FDEC_STRIDE], %1 - movdqa [r0 + 2*FDEC_STRIDE], %1 - movdqa [r0 + 3*FDEC_STRIDE], %1 - add r0, 4*FDEC_STRIDE - dec r1d - jg .loop + add r0, 4*FDEC_STRIDE + movdqa [r0 + -4*FDEC_STRIDE], %1 + movdqa [r0 + -3*FDEC_STRIDE], %1 + movdqa [r0 + -2*FDEC_STRIDE], %1 + movdqa [r0 + -1*FDEC_STRIDE], %1 + movdqa [r0 + 0*FDEC_STRIDE], %1 + movdqa [r0 + 1*FDEC_STRIDE], %1 + movdqa [r0 + 2*FDEC_STRIDE], %1 + movdqa [r0 + 3*FDEC_STRIDE], %1 + add r0, 8*FDEC_STRIDE + movdqa [r0 + -4*FDEC_STRIDE], %1 + movdqa [r0 + -3*FDEC_STRIDE], %1 + movdqa [r0 + -2*FDEC_STRIDE], %1 + movdqa [r0 + -1*FDEC_STRIDE], %1 + movdqa [r0 + 0*FDEC_STRIDE], %1 + movdqa [r0 + 1*FDEC_STRIDE], %1 + movdqa [r0 + 2*FDEC_STRIDE], %1 + movdqa [r0 + 3*FDEC_STRIDE], %1 %endmacro SECTION_RODATA @@ -102,18 +112,17 @@ SECTION .text ; void predict_4x4_ddl_mmxext( uint8_t *src ) ;----------------------------------------------------------------------------- cglobal predict_4x4_ddl_mmxext, 1,1 - sub r0, FDEC_STRIDE - movq mm3, [r0] - movq mm1, [r0-1] - movq mm2, mm3 - movq mm4, [pb_0s_ff GLOBAL] - psrlq mm2, 8 - pand mm4, mm3 - por mm2, mm4 + movq mm1, 
[r0-FDEC_STRIDE] + movq mm2, mm1 + movq mm3, mm1 + movq mm4, mm1 + psllq mm1, 8 + pxor mm2, mm1 + psrlq mm2, 8 + pxor mm3, mm2 + PRED8x8_LOWPASS mm0, mm1, mm3, mm4, mm5 - PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5 - -%assign Y 1 +%assign Y 0 %rep 4 psrlq mm0, 8 movd [r0+Y*FDEC_STRIDE], mm0 @@ -122,6 +131,121 @@ cglobal predict_4x4_ddl_mmxext, 1,1 RET +;----------------------------------------------------------------------------- +; void predict_4x4_ddr_mmxext( uint8_t *src ) +;----------------------------------------------------------------------------- +%macro PREDICT_4x4 1 +cglobal predict_4x4_ddr_%1, 1,1 + movq mm1, [r0+1*FDEC_STRIDE-8] + movq mm2, [r0+0*FDEC_STRIDE-8] + punpckhbw mm2, [r0-1*FDEC_STRIDE-8] + movd mm3, [r0-1*FDEC_STRIDE] + punpckhwd mm1, mm2 + PALIGNR mm3, mm1, 5, mm4 + movq mm1, mm3 + PALIGNR mm3, [r0+2*FDEC_STRIDE-8], 7, mm4 + movq mm2, mm3 + PALIGNR mm3, [r0+3*FDEC_STRIDE-8], 7, mm4 + PRED8x8_LOWPASS mm0, mm3, mm1, mm2, mm4 +%assign Y 3 + movd [r0+Y*FDEC_STRIDE], mm0 +%rep 3 +%assign Y (Y-1) + psrlq mm0, 8 + movd [r0+Y*FDEC_STRIDE], mm0 +%endrep + RET + +cglobal predict_4x4_vr_%1, 1,1 + movd mm0, [r0-1*FDEC_STRIDE] ; ........t3t2t1t0 + movq mm7, mm0 + PALIGNR mm0, [r0-1*FDEC_STRIDE-8], 7, mm1 ; ......t3t2t1t0lt + pavgb mm7, mm0 + PALIGNR mm0, [r0+0*FDEC_STRIDE-8], 7, mm1 ; ....t3t2t1t0ltl0 + movq mm1, mm0 + PALIGNR mm0, [r0+1*FDEC_STRIDE-8], 7, mm2 ; ..t3t2t1t0ltl0l1 + movq mm2, mm0 + PALIGNR mm0, [r0+2*FDEC_STRIDE-8], 7, mm3 ; t3t2t1t0ltl0l1l2 + PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4 + movq mm1, mm3 + psrlq mm3, 16 + psllq mm1, 48 + movd [r0+0*FDEC_STRIDE], mm7 + movd [r0+1*FDEC_STRIDE], mm3 + PALIGNR mm7, mm1, 7, mm2 + psllq mm1, 8 + movd [r0+2*FDEC_STRIDE], mm7 + PALIGNR mm3, mm1, 7, mm2 + movd [r0+3*FDEC_STRIDE], mm3 + RET + +cglobal predict_4x4_hd_%1, 1,1 + movd mm0, [r0-1*FDEC_STRIDE-4] ; lt .. + punpckldq mm0, [r0-1*FDEC_STRIDE] ; t3 t2 t1 t0 lt .. .. .. + psllq mm0, 8 ; t2 t1 t0 lt .. .. .. .. 
+ movq mm1, [r0+3*FDEC_STRIDE-8] ; l3 + punpckhbw mm1, [r0+2*FDEC_STRIDE-8] ; l2 l3 + movq mm2, [r0+1*FDEC_STRIDE-8] ; l1 + punpckhbw mm2, [r0+0*FDEC_STRIDE-8] ; l0 l1 + punpckhwd mm1, mm2 ; l0 l1 l2 l3 + punpckhdq mm1, mm0 ; t2 t1 t0 lt l0 l1 l2 l3 + movq mm0, mm1 + movq mm2, mm1 + movq mm7, mm1 + psrlq mm0, 16 ; .. .. t2 t1 t0 lt l0 l1 + psrlq mm2, 8 ; .. t2 t1 t0 lt l0 l1 l2 + pavgb mm7, mm2 + PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4 + punpcklbw mm7, mm3 + psrlq mm3, 32 + PALIGNR mm3, mm7, 6, mm6 +%assign Y 3 + movd [r0+Y*FDEC_STRIDE], mm7 +%rep 2 +%assign Y (Y-1) + psrlq mm7, 16 + movd [r0+Y*FDEC_STRIDE], mm7 +%endrep + movd [r0+0*FDEC_STRIDE], mm3 + RET +%endmacro + +%define PALIGNR PALIGNR_MMX +PREDICT_4x4 mmxext +%define PALIGNR PALIGNR_SSSE3 +PREDICT_4x4 ssse3 + +;----------------------------------------------------------------------------- +; void predict_4x4_hu_mmxext( uint8_t *src ) +;----------------------------------------------------------------------------- +cglobal predict_4x4_hu_mmxext, 1,1 + movq mm0, [r0+0*FDEC_STRIDE-8] + punpckhbw mm0, [r0+1*FDEC_STRIDE-8] + movq mm1, [r0+2*FDEC_STRIDE-8] + punpckhbw mm1, [r0+3*FDEC_STRIDE-8] + punpckhwd mm0, mm1 + movq mm1, mm0 + punpckhbw mm1, mm1 + pshufw mm1, mm1, 0xFF + punpckhdq mm0, mm1 + movq mm2, mm0 + movq mm3, mm0 + movq mm7, mm0 + psrlq mm2, 16 + psrlq mm3, 8 + pavgb mm7, mm3 + PRED8x8_LOWPASS mm4, mm0, mm2, mm3, mm5 + punpcklbw mm7, mm4 +%assign Y 0 + movd [r0+Y*FDEC_STRIDE], mm7 +%rep 2 +%assign Y (Y+1) + psrlq mm7, 16 + movd [r0+Y*FDEC_STRIDE], mm7 +%endrep + movd [r0+3*FDEC_STRIDE], mm1 + RET + ;----------------------------------------------------------------------------- ; void predict_4x4_vl_mmxext( uint8_t *src ) ;----------------------------------------------------------------------------- @@ -170,6 +294,120 @@ cglobal predict_4x4_dc_mmxext, 1,4 mov [r0+FDEC_STRIDE*3], r1d RET +%macro PREDICT_FILTER 1 +;----------------------------------------------------------------------------- +;void 
predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters ) +;----------------------------------------------------------------------------- + +cglobal predict_8x8_filter_%1, 4,5 + add r0, 0x58 +%define src r0-0x58 +%ifndef ARCH_X86_64 + mov r4, r1 +%define t1 r4 +%define t4 r1 +%else +%define t1 r1 +%define t4 r4 +%endif + test r3b, 0x01 + je .check_top + movq mm0, [src+0*FDEC_STRIDE-8] + punpckhbw mm0, [src-1*FDEC_STRIDE-8] + movq mm1, [src+2*FDEC_STRIDE-8] + punpckhbw mm1, [src+1*FDEC_STRIDE-8] + punpckhwd mm1, mm0 + movq mm2, [src+4*FDEC_STRIDE-8] + punpckhbw mm2, [src+3*FDEC_STRIDE-8] + movq mm3, [src+6*FDEC_STRIDE-8] + punpckhbw mm3, [src+5*FDEC_STRIDE-8] + punpckhwd mm3, mm2 + punpckhdq mm3, mm1 + movq mm0, [src+7*FDEC_STRIDE-8] + movq mm1, [src-1*FDEC_STRIDE] + movq mm4, mm3 + movq mm2, mm3 + PALIGNR mm4, mm0, 7, mm0 + PALIGNR mm1, mm2, 1, mm2 + test r2b, 0x08 + je .fix_lt_1 +.do_left: + movq mm0, mm4 + PRED8x8_LOWPASS mm2, mm1, mm4, mm3, mm5 + movq [t1+8], mm2 + movq mm4, mm0 + PRED8x8_LOWPASS mm1, mm3, mm0, mm4, mm5 + movd t4, mm1 + mov [t1+7], t4b +.check_top: + test r3b, 0x02 + je .done + movq mm0, [src-1*FDEC_STRIDE-8] + movq mm3, [src-1*FDEC_STRIDE] + movq mm1, [src-1*FDEC_STRIDE+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r2b, 0x08 + je .fix_lt_2 + test r2b, 0x04 + je .fix_tr_1 +.do_top: + PRED8x8_LOWPASS mm4, mm2, mm1, mm3, mm5 + movq [t1+16], mm4 + test r3b, 0x04 + je .done + test r2b, 0x04 + je .fix_tr_2 + movq mm0, [src-1*FDEC_STRIDE+8] + movq mm5, mm0 + movq mm2, mm0 + movq mm4, mm0 + psrlq mm5, 56 + PALIGNR mm2, mm3, 7, mm3 + PALIGNR mm5, mm4, 1, mm4 + PRED8x8_LOWPASS mm1, mm2, mm5, mm0, mm4 + jmp .do_topright +.fix_tr_2: + punpckhbw mm3, mm3 + pshufw mm1, mm3, 0xFF +.do_topright: + movq [t1+24], mm1 + psrlq mm1, 56 + movd t4, mm1 + mov [t1+32], t4b +.done: + REP_RET +.fix_lt_1: + movq mm5, mm3 + pxor mm5, mm4 + psrlq mm5, 56 + psllq mm5, 48 + pxor mm1, mm5 + jmp 
.do_left +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2b, 0x04 + jne .do_top +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 + jmp .do_top +%endmacro + +%define PALIGNR PALIGNR_MMX +PREDICT_FILTER mmxext +%define PALIGNR PALIGNR_SSSE3 +PREDICT_FILTER ssse3 + ;----------------------------------------------------------------------------- ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- @@ -411,6 +649,173 @@ cglobal predict_8x8_vr_core_mmxext, 2,2 RET + +;----------------------------------------------------------------------------- +; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +%define PALIGNR PALIGNR_MMX +cglobal predict_8x8_hd_mmxext, 2,2 + add r0, 4*FDEC_STRIDE + movq mm0, [r1] ; l7 .. .. .. .. .. .. .. + movq mm1, [r1+8] ; lt l0 l1 l2 l3 l4 l5 l6 + movq mm2, [r1+16] ; t7 t6 t5 t4 t3 t2 t1 t0 + movq mm3, mm1 ; lt l0 l1 l2 l3 l4 l5 l6 + movq mm4, mm2 ; t7 t6 t5 t4 t3 t2 t1 t0 + PALIGNR mm2, mm1, 7, mm5 ; t6 t5 t4 t3 t2 t1 t0 lt + PALIGNR mm1, mm0, 7, mm6 ; l0 l1 l2 l3 l4 l5 l6 l7 + PALIGNR mm4, mm3, 1, mm7 ; t0 lt l0 l1 l2 l3 l4 l5 + movq mm5, mm3 + pavgb mm3, mm1 + PRED8x8_LOWPASS mm0, mm4, mm1, mm5, mm7 + movq mm4, mm2 + movq mm1, mm2 ; t6 t5 t4 t3 t2 t1 t0 lt + psrlq mm4, 16 ; .. .. t6 t5 t4 t3 t2 t1 + psrlq mm1, 8 ; .. t6 t5 t4 t3 t2 t1 t0 + PRED8x8_LOWPASS mm6, mm4, mm2, mm1, mm5 + ; .. 
p11 p10 p9 + movq mm7, mm3 + punpcklbw mm3, mm0 ; p4 p3 p2 p1 + punpckhbw mm7, mm0 ; p8 p7 p6 p5 + movq mm1, mm7 + movq mm0, mm7 + movq mm4, mm7 + movq [r0+3*FDEC_STRIDE], mm3 + PALIGNR mm7, mm3, 2, mm5 + movq [r0+2*FDEC_STRIDE], mm7 + PALIGNR mm1, mm3, 4, mm5 + movq [r0+1*FDEC_STRIDE], mm1 + PALIGNR mm0, mm3, 6, mm5 + movq [r0+0*FDEC_STRIDE], mm0 + movq mm2, mm6 + movq mm3, mm6 + movq [r0-1*FDEC_STRIDE], mm4 + PALIGNR mm6, mm4, 2, mm5 + movq [r0-2*FDEC_STRIDE], mm6 + PALIGNR mm2, mm4, 4, mm5 + movq [r0-3*FDEC_STRIDE], mm2 + PALIGNR mm3, mm4, 6, mm5 + movq [r0-4*FDEC_STRIDE], mm3 + RET + +;----------------------------------------------------------------------------- +; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +cglobal predict_8x8_hd_ssse3, 2,2 + add r0, 4*FDEC_STRIDE + movdqa xmm0, [r1] + movdqa xmm1, [r1+16] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 + palignr xmm1, xmm0, 7 + palignr xmm2, xmm0, 9 + palignr xmm3, xmm0, 8 + movdqa xmm4, xmm1 + pavgb xmm4, xmm3 + PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5 + punpcklbw xmm4, xmm0 + movhlps xmm0, xmm4 + +%assign Y 3 +%rep 3 + movq [r0+(Y)*FDEC_STRIDE], xmm4 + movq [r0+(Y-4)*FDEC_STRIDE], xmm0 + psrldq xmm4, 2 + psrldq xmm0, 2 +%assign Y (Y-1) +%endrep + movq [r0+(Y)*FDEC_STRIDE], xmm4 + movq [r0+(Y-4)*FDEC_STRIDE], xmm0 + RET + +;----------------------------------------------------------------------------- +; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +cglobal predict_8x8_hu_mmxext, 2,2 + movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 + add r0, 4*FDEC_STRIDE + pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 + psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
+ movq mm2, mm0 + psllw mm0, 8 + psrlw mm2, 8 + por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 + movq mm3, mm2 + movq mm4, mm2 + movq mm5, mm2 + psrlq mm2, 8 + psrlq mm3, 16 + por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 + punpckhbw mm1, mm1 + por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 + pavgb mm4, mm2 + PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 + movq mm5, mm4 + punpcklbw mm4, mm1 ; p4 p3 p2 p1 + punpckhbw mm5, mm1 ; p8 p7 p6 p5 + movq mm6, mm5 + movq mm7, mm5 + movq mm0, mm5 + PALIGNR mm5, mm4, 2, mm1 + pshufw mm1, mm6, 11111001b + PALIGNR mm6, mm4, 4, mm2 + pshufw mm2, mm7, 11111110b + PALIGNR mm7, mm4, 6, mm3 + pshufw mm3, mm0, 11111111b + movq [r0-4*FDEC_STRIDE], mm4 + movq [r0-3*FDEC_STRIDE], mm5 + movq [r0-2*FDEC_STRIDE], mm6 + movq [r0-1*FDEC_STRIDE], mm7 + movq [r0+0*FDEC_STRIDE], mm0 + movq [r0+1*FDEC_STRIDE], mm1 + movq [r0+2*FDEC_STRIDE], mm2 + movq [r0+3*FDEC_STRIDE], mm3 + RET + +;----------------------------------------------------------------------------- +; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +cglobal predict_8x8_hu_sse2, 2,2 + movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 + add r0, 4*FDEC_STRIDE + pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 + psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
+ movq mm2, mm0 + psllw mm0, 8 + psrlw mm2, 8 + por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 + movq mm3, mm2 + movq mm4, mm2 + movq mm5, mm2 + psrlq mm2, 8 + psrlq mm3, 16 + por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 + punpckhbw mm1, mm1 + por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 + pavgb mm4, mm2 + PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 + + movq2dq xmm0, mm4 + movq2dq xmm1, mm1 + punpcklbw xmm0, xmm1 + + movhlps xmm4, xmm0 + pshuflw xmm5, xmm4, 11111001b + pshuflw xmm6, xmm4, 11111110b + pshuflw xmm7, xmm4, 11111111b +%assign Y -4 +%rep 3 + movq [r0+Y*FDEC_STRIDE], xmm0 + psrldq xmm0, 2 +%assign Y (Y+1) +%endrep + movq [r0+Y*FDEC_STRIDE], xmm0 + movq [r0+0*FDEC_STRIDE], xmm4 + movq [r0+1*FDEC_STRIDE], xmm5 + movq [r0+2*FDEC_STRIDE], xmm6 + movq [r0+3*FDEC_STRIDE], xmm7 + RET + + ;----------------------------------------------------------------------------- ; void predict_8x8c_v_mmx( uint8_t *src ) ;----------------------------------------------------------------------------- diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index 34a98d6d..a5665cae 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -38,6 +38,8 @@ extern void predict_8x8c_h_mmxext( uint8_t *src ); extern void predict_8x8c_h_ssse3( uint8_t *src ); extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] ); @@ -46,27 +48,45 @@ extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_vl_sse2( 
uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] ); extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] ); +extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters ); +extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters ); extern void predict_4x4_ddl_mmxext( uint8_t *src ); +extern void predict_4x4_ddr_mmxext( uint8_t *src ); extern void predict_4x4_vl_mmxext( uint8_t *src ); +extern void predict_4x4_vr_mmxext( uint8_t *src ); +extern void predict_4x4_vr_ssse3( uint8_t *src ); +extern void predict_4x4_hd_mmxext( uint8_t *src ); +extern void predict_4x4_hd_ssse3( uint8_t *src ); extern void predict_4x4_dc_mmxext( uint8_t *src ); +extern void predict_4x4_ddr_ssse3( uint8_t *src ); +extern void predict_4x4_hu_mmxext( uint8_t *src ); extern void predict_16x16_dc_top_sse2( uint8_t *src ); extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left ); extern void predict_16x16_v_sse2( uint8_t *src ); extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c ); +#define PREDICT_P_SUM(j,i)\ + H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\ + V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\ + #define PREDICT_16x16_P(name)\ static void predict_16x16_p_##name( uint8_t *src )\ {\ - int a, b, c, i;\ + int a, b, c;\ int H = 0;\ int V = 0;\ int i00;\ - for( i = 1; i <= 8; i++ )\ - {\ - H += i * ( src[7+i - FDEC_STRIDE ] - src[7-i - FDEC_STRIDE ] );\ - V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );\ - }\ + PREDICT_P_SUM(7,1) \ + PREDICT_P_SUM(7,2) \ + PREDICT_P_SUM(7,3) \ + PREDICT_P_SUM(7,4) \ + PREDICT_P_SUM(7,5) \ + PREDICT_P_SUM(7,6) \ + PREDICT_P_SUM(7,7) \ + PREDICT_P_SUM(7,8) \ a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ b = ( 5 * H + 32 ) >> 6;\ 
c = ( 5 * V + 32 ) >> 6;\ @@ -79,16 +99,15 @@ PREDICT_16x16_P( sse2 ) static void predict_8x8c_p_mmxext( uint8_t *src ) { - int a, b, c, i; + int a, b, c; int H = 0; int V = 0; int i00; - for( i = 1; i <= 4; i++ ) - { - H += i * ( src[3+i - FDEC_STRIDE] - src[3-i - FDEC_STRIDE] ); - V += i * ( src[(3+i)*FDEC_STRIDE -1] - src[(3-i)*FDEC_STRIDE -1] ); - } + PREDICT_P_SUM(3,1) + PREDICT_P_SUM(3,2) + PREDICT_P_SUM(3,3) + PREDICT_P_SUM(3,4) a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] ); b = ( 17 * H + 16 ) >> 5; @@ -521,7 +540,7 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3; } -void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] ) +void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ) { if( !(cpu&X264_CPU_MMXEXT) ) return; @@ -531,6 +550,9 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] ) pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext; pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext; pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext; + pf[I_PRED_8x8_HD] = predict_8x8_hd_mmxext; + pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext; + *predict_8x8_filter = predict_8x8_filter_mmxext; #ifdef ARCH_X86 pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext; pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext; @@ -540,6 +562,11 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] ) pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2; pf[I_PRED_8x8_VL] = predict_8x8_vl_sse2; pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2; + pf[I_PRED_8x8_HU] = predict_8x8_hu_sse2; + if( !(cpu&X264_CPU_SSSE3) ) + return; + pf[I_PRED_8x8_HD] = predict_8x8_hd_ssse3; + *predict_8x8_filter = predict_8x8_filter_ssse3; } void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] ) @@ -554,7 +581,16 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] ) #endif if( !(cpu&X264_CPU_MMXEXT) ) return; + pf[I_PRED_4x4_VR] = predict_4x4_vr_mmxext; 
pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext; pf[I_PRED_4x4_VL] = predict_4x4_vl_mmxext; pf[I_PRED_4x4_DC] = predict_4x4_dc_mmxext; + pf[I_PRED_4x4_DDR] = predict_4x4_ddr_mmxext; + pf[I_PRED_4x4_HD] = predict_4x4_hd_mmxext; + pf[I_PRED_4x4_HU] = predict_4x4_hu_mmxext; + if( !(cpu&X264_CPU_SSSE3) ) + return; + pf[I_PRED_4x4_DDR] = predict_4x4_ddr_ssse3; + pf[I_PRED_4x4_VR] = predict_4x4_vr_ssse3; + pf[I_PRED_4x4_HD] = predict_4x4_hd_ssse3; } diff --git a/common/x86/predict.h b/common/x86/predict.h index 398f3076..70aff09f 100644 --- a/common/x86/predict.h +++ b/common/x86/predict.h @@ -27,6 +27,5 @@ void x264_predict_16x16_init_mmx ( int cpu, x264_predict_t pf[7] ); void x264_predict_8x8c_init_mmx ( int cpu, x264_predict_t pf[7] ); void x264_predict_4x4_init_mmx ( int cpu, x264_predict_t pf[12] ); -void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12] ); - +void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ); #endif diff --git a/encoder/analyse.c b/encoder/analyse.c index b02b945f..62d51c5d 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -683,7 +683,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx ); predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max ); - x264_predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); + h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); if( b_merged_satd && i_max == 9 ) { diff --git a/encoder/encoder.c b/encoder/encoder.c index 3ef62f74..6aa69f12 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -727,7 +727,7 @@ x264_t *x264_encoder_open ( x264_param_t *param ) /* init CPU functions */ x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); - x264_predict_8x8_init( h->param.cpu, h->predict_8x8 ); + 
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); if( !h->param.b_cabac ); x264_init_vlc_tables(); diff --git a/tools/checkasm.c b/tools/checkasm.c index 29ddadd5..b88bf402 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -228,6 +228,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) x264_predict_t predict_8x8c[4+3]; x264_predict_t predict_4x4[9+3]; x264_predict8x8_t predict_8x8[9+3]; + x264_predict_8x8_filter_t predict_8x8_filter; DECLARE_ALIGNED_16( uint8_t edge[33] ); uint16_t cost_mv[32]; int ret = 0, ok, used_asm; @@ -238,9 +239,9 @@ static int check_pixel( int cpu_ref, int cpu_new ) x264_pixel_init( cpu_new, &pixel_asm ); x264_predict_16x16_init( 0, predict_16x16 ); x264_predict_8x8c_init( 0, predict_8x8c ); - x264_predict_8x8_init( 0, predict_8x8 ); + x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter ); x264_predict_4x4_init( 0, predict_4x4 ); - x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); + predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); // maximize sum for( i=0; i<256; i++ ) @@ -1294,30 +1295,32 @@ static int check_intra( int cpu_ref, int cpu_new ) int ret = 0, ok = 1, used_asm = 0; int i; DECLARE_ALIGNED_16( uint8_t edge[33] ); + DECLARE_ALIGNED_16( uint8_t edge2[33] ); struct { x264_predict_t predict_16x16[4+3]; x264_predict_t predict_8x8c[4+3]; x264_predict8x8_t predict_8x8[9+3]; x264_predict_t predict_4x4[9+3]; + x264_predict_8x8_filter_t predict_8x8_filter; } ip_c, ip_ref, ip_a; x264_predict_16x16_init( 0, ip_c.predict_16x16 ); x264_predict_8x8c_init( 0, ip_c.predict_8x8c ); - x264_predict_8x8_init( 0, ip_c.predict_8x8 ); + x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter ); x264_predict_4x4_init( 0, ip_c.predict_4x4 ); x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 ); x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c ); - x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8 
); + x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter ); x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 ); x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 ); x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c ); - x264_predict_8x8_init( cpu_new, ip_a.predict_8x8 ); + x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter ); x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 ); - x264_predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); + ip_c.predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); #define INTRA_TEST( name, dir, w, ... ) \ if( ip_a.name[dir] != ip_ref.name[dir] )\ @@ -1361,6 +1364,23 @@ static int check_intra( int cpu_ref, int cpu_new ) for( i = 0; i < 12; i++ ) INTRA_TEST( predict_8x8, i, 8, edge ); + set_func_name("intra_predict_8x8_filter"); + if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter ) + { + used_asm = 1; /* flag asm as tested only when the new filter differs from the reference one */ + for( i = 0; i < 32; i++ ) + { + memcpy( edge2, edge, 33 ); + call_c(ip_c.predict_8x8_filter, buf1+48, edge, (i&24)>>1, i&7); + call_a(ip_a.predict_8x8_filter, buf1+48, edge2, (i&24)>>1, i&7); + if( memcmp( edge, edge2, 33 ) ) + { + fprintf( stderr, "predict_8x8_filter : [FAILED] %d %d\n", (i&24)>>1, i&7); + ok = 0; + } + } + } + report( "intra pred :" ); return ret; } -- 2.40.0