From: Loren Merritt
Date: Sun, 2 Mar 2008 02:11:12 +0000 (+0000)
Subject: sse2 16x16 intra pred.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=564cc252b099ad12e9c33dd9404ad64ed0bc5b8f;p=libx264

sse2 16x16 intra pred.
port the remaining intra pred functions from x86_64 to x86_32.
patch by Fiona Glaser.

git-svn-id: svn://svn.videolan.org/x264/trunk@742 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm
index d2cce53a..caecde3a 100644
--- a/common/amd64/predict-a.asm
+++ b/common/amd64/predict-a.asm
@@ -56,24 +56,31 @@ BITS 64
     nop
 %endmacro
+%macro STORE16x16_SSE2 1
+    mov         eax, 4
+.loop:
+    movdqa      [parm1q + 0*FDEC_STRIDE], %1
+    movdqa      [parm1q + 1*FDEC_STRIDE], %1
+    movdqa      [parm1q + 2*FDEC_STRIDE], %1
+    movdqa      [parm1q + 3*FDEC_STRIDE], %1
+    add         parm1q, 4*FDEC_STRIDE
+    dec         eax
+    jg          .loop
+    nop
+%endmacro

 SECTION_RODATA

-pw_2: times 4 dw 2
-pw_4: times 4 dw 4
-pw_8: times 4 dw 8
-pw_3210:
-    dw 0
-    dw 1
-    dw 2
-    dw 3
 ALIGN 16
-pb_1: times 16 db 1
-pb_00s_ff:
-    times 8 db 0
-pb_0s_ff:
-    times 7 db 0
-    db 0xff
+pb_1:       times 16 db 1
+pw_2:       times 4 dw 2
+pw_4:       times 4 dw 4
+pw_8:       times 8 dw 8
+pw_76543210:
+pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
+pb_00s_ff:  times 8 db 0
+pb_0s_ff:   times 7 db 0
+            db 0xff


 ;=============================================================================
 ; Code
@@ -460,7 +467,45 @@ ALIGN 4
     nop
     ret

-
+
+;-----------------------------------------------------------------------------
+; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
+;-----------------------------------------------------------------------------
+cglobal predict_16x16_p_core_sse2
+    movd        xmm0, parm2d
+    movd        xmm1, parm3d
+    movd        xmm2, parm4d
+    pshuflw     xmm0, xmm0, 0
+    pshuflw     xmm1, xmm1, 0
+    pshuflw     xmm2, xmm2, 0
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    movdqa      xmm3, xmm1
+    pmullw      xmm3, [pw_76543210 GLOBAL]
+    psllw       xmm1, 3
+    paddsw      xmm0, xmm3  ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
+    paddsw      xmm1, xmm0  ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
+
+    mov         eax, 16
+ALIGN 4
+.loop:
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm1
+    psraw       xmm3, 5
+    psraw       xmm4, 5
+    packuswb    xmm3, xmm4
+    movdqa      [parm1q], xmm3
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm1, xmm2
+    add         parm1q, FDEC_STRIDE
+    dec         eax
+    jg          .loop
+
+    nop
+    ret
+
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_v_mmx( uint8_t *src )
 ;-----------------------------------------------------------------------------
@@ -470,6 +515,14 @@ cglobal predict_16x16_v_mmx
     STORE16x16 mm0, mm1
     ret

+;-----------------------------------------------------------------------------
+; void predict_16x16_v_sse2( uint8_t *src )
+;-----------------------------------------------------------------------------
+cglobal predict_16x16_v_sse2
+    movdqa      xmm0, [parm1q - FDEC_STRIDE]
+    STORE16x16_SSE2 xmm0
+    ret
+
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
 ;-----------------------------------------------------------------------------
@@ -496,3 +549,29 @@ cglobal predict_16x16_dc_top_mmxext
     PRED16x16_DC [pw_8 GLOBAL], 4
     ret

+;-----------------------------------------------------------------------------
+; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_DC_SSE2 2
+    pxor        xmm0, xmm0
+    psadbw      xmm0, [parm1q - FDEC_STRIDE]
+    movhlps     xmm1, xmm0
+    paddw       xmm0, xmm1
+    paddusw     xmm0, %1
+    psrlw       xmm0, %2        ; dc
+    pshuflw     xmm0, xmm0, 0
+    punpcklqdq  xmm0, xmm0
+    packuswb    xmm0, xmm0      ; dc in bytes
+    STORE16x16_SSE2 xmm0
+%endmacro
+
+cglobal predict_16x16_dc_core_sse2
+    movd        xmm2, parm2d
+    PRED16x16_DC_SSE2 xmm2, 5
+    ret
+
+cglobal predict_16x16_dc_top_sse2
+    PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
+    ret
+
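For reference, the plane-prediction core above reduces to the following scalar C.
This is an illustrative sketch, not part of the patch: clip_uint8 is a hypothetical
helper standing in for packuswb's saturation, and FDEC_STRIDE is x264's 32-byte
decode-cache stride. It follows the i00/b/c parameterization set up by the C
wrappers in predict-c.c below.

    #include <stdint.h>

    #define FDEC_STRIDE 32  /* x264's stride for the decoded-pixel cache */

    static inline uint8_t clip_uint8( int x )  /* what packuswb does */
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }

    /* Row y of the plane prediction starts at i00 + y*c, and each pixel to
     * the right adds b. The asm keeps {i00+0*b .. i00+15*b} as words in
     * xmm0/xmm1 (built via pw_76543210), arithmetic-shifts right by 5,
     * packs to bytes, and steps to the next row by adding the splatted c
     * held in xmm2. */
    static void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
    {
        int x, y;
        for( y = 0; y < 16; y++ )
        {
            int pix = i00;
            for( x = 0; x < 16; x++ )
            {
                src[x] = clip_uint8( pix >> 5 );
                pix += b;
            }
            src += FDEC_STRIDE;
            i00 += c;
        }
    }
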
diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm
index baafd68e..6dc71114 100644
--- a/common/i386/predict-a.asm
+++ b/common/i386/predict-a.asm
@@ -56,18 +56,31 @@ BITS 32
     nop
 %endmacro
+%macro STORE16x16_SSE2 1
+    mov         eax, 4
+.loop:
+    movdqa      [edx + 0*FDEC_STRIDE], %1
+    movdqa      [edx + 1*FDEC_STRIDE], %1
+    movdqa      [edx + 2*FDEC_STRIDE], %1
+    movdqa      [edx + 3*FDEC_STRIDE], %1
+    add         edx, 4*FDEC_STRIDE
+    dec         eax
+    jg          .loop
+    nop
+%endmacro
+

 SECTION_RODATA

-ALIGN 8
-pw_2: times 4 dw 2
-pw_4: times 4 dw 4
-pw_8: times 4 dw 8
-pb_1: times 8 db 1
-pw_3210:
-    dw 0
-    dw 1
-    dw 2
-    dw 3
+ALIGN 16
+pb_1:       times 16 db 1
+pw_2:       times 4 dw 2
+pw_4:       times 4 dw 4
+pw_8:       times 8 dw 8
+pw_76543210:
+pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
+pb_00s_ff:  times 8 db 0
+pb_0s_ff:   times 7 db 0
+            db 0xff

 ;=============================================================================
 ; Code
@@ -77,15 +90,69 @@ SECTION .text

 ; dest, left, right, src, tmp
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%macro PRED8x8_LOWPASS 5
-    movq        %5, %2
+; dest, left, right, src, tmp
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
+%macro PRED8x8_LOWPASS0 6
+    mov%6       %5, %2
     pavgb       %2, %3
     pxor        %3, %5
-    movq        %1, %4
+    mov%6       %1, %4
     pand        %3, [pb_1 GOT_ecx]
     psubusb     %2, %3
     pavgb       %1, %2
 %endmacro
+%macro PRED8x8_LOWPASS 5
+    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
+%endmacro
+%macro PRED8x8_LOWPASS_XMM 5
+    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; void predict_4x4_ddl_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+cglobal predict_4x4_ddl_mmxext
+    mov         eax, [esp + 4]
+    picgetgot   ecx
+    movq        mm3, [eax - FDEC_STRIDE ]
+    movq        mm1, [eax - FDEC_STRIDE - 1]
+    movq        mm2, mm3
+    movq        mm4, [pb_0s_ff GOT_ecx]
+    psrlq       mm2, 8
+    pand        mm4, mm3
+    por         mm2, mm4
+    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
+%assign Y 0
+%rep 4
+    psrlq       mm0, 8
+    movd        [eax + Y * FDEC_STRIDE], mm0
+%assign Y (Y+1)
+%endrep
+    ret
+
+;-----------------------------------------------------------------------------
+; void predict_4x4_vl_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+cglobal predict_4x4_vl_mmxext
+    mov         eax, [esp + 4]
+    picgetgot   ecx
+    movq        mm1, [eax - FDEC_STRIDE]
+    movq        mm3, mm1
+    movq        mm2, mm1
+    psrlq       mm3, 8
+    psrlq       mm2, 16
+    movq        mm4, mm3
+    pavgb       mm4, mm1
+    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
+    movd        [eax + 0*FDEC_STRIDE], mm4
+    movd        [eax + 1*FDEC_STRIDE], mm0
+    psrlq       mm4, 8
+    psrlq       mm0, 8
+    movd        [eax + 2*FDEC_STRIDE], mm4
+    movd        [eax + 3*FDEC_STRIDE], mm0
+
+    ret


 ;-----------------------------------------------------------------------------
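The PRED8x8_LOWPASS macros implement the 3-tap filter named in their comment
without ever widening bytes to words. A scalar sketch of the identity they
rely on (the helper name lowpass is hypothetical, not from the patch):

    #include <stdint.h>

    /* One output tap of PRED8x8_LOWPASS: (l + 2*c + r + 2) >> 2, computed
     * entirely in bytes. pavgb(l,r) gives (l+r+1)>>1; subtracting (l^r)&1
     * cancels that rounding exactly when l+r is odd, leaving (l+r)>>1; a
     * final pavgb against the centre pixel restores the +2 rounding. */
    static uint8_t lowpass( uint8_t l, uint8_t c, uint8_t r )
    {
        uint8_t avg_lr = (uint8_t)( ( l + r + 1 ) >> 1 ) - ( ( l ^ r ) & 1 );
        return (uint8_t)( ( c + avg_lr + 1 ) >> 1 );  /* == (l + 2*c + r + 2) >> 2 */
    }

The %6 parameter of PRED8x8_LOWPASS0 simply selects movq (MMX) or movdqa
(SSE2), so the same body serves both register widths.
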
@@ -370,6 +437,47 @@ ALIGN 4
     nop
     ret

+;-----------------------------------------------------------------------------
+; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
+;-----------------------------------------------------------------------------
+cglobal predict_16x16_p_core_sse2
+    picgetgot   ecx
+
+    mov         edx, [esp + 4 ]
+    movd        xmm0, [esp + 8 ]
+    movd        xmm1, [esp + 12]
+    movd        xmm2, [esp + 16]
+    pshuflw     xmm0, xmm0, 0
+    pshuflw     xmm1, xmm1, 0
+    pshuflw     xmm2, xmm2, 0
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    movdqa      xmm3, xmm1
+    pmullw      xmm3, [pw_76543210 GOT_ecx]
+    psllw       xmm1, 3
+    paddsw      xmm0, xmm3  ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
+    paddsw      xmm1, xmm0  ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
+
+    mov         eax, 16
+ALIGN 4
+.loop:
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm1
+    psraw       xmm3, 5
+    psraw       xmm4, 5
+    packuswb    xmm3, xmm4
+    movdqa      [edx], xmm3
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm1, xmm2
+    add         edx, FDEC_STRIDE
+    dec         eax
+    jg          .loop
+
+    nop
+    ret
+
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_v_mmx( uint8_t *src )
 ;-----------------------------------------------------------------------------
@@ -380,6 +488,15 @@ cglobal predict_16x16_v_mmx
     STORE16x16 mm0, mm1
     ret

+;-----------------------------------------------------------------------------
+; void predict_16x16_v_sse2( uint8_t *src )
+;-----------------------------------------------------------------------------
+cglobal predict_16x16_v_sse2
+    mov         edx, [esp + 4]
+    movdqa      xmm0, [edx - FDEC_STRIDE]
+    STORE16x16_SSE2 xmm0
+    ret
+
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
 ;-----------------------------------------------------------------------------
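The DC hunk that follows splats a single average into all 256 pixels. A scalar
model of what predict_16x16_dc_core_sse2 computes (an illustrative sketch, not
the patch's code; i_dc_left arrives from the C wrapper below as sum(left) + 16):

    #include <stdint.h>

    #define FDEC_STRIDE 32

    /* psadbw against the row above src yields the sum of the 16 top
     * neighbours (in two halves that movhlps+paddw merge), so after adding
     * i_dc_left the psrlw by 5 is a rounded divide by 32. The dc_top
     * variant instead adds pw_8 and shifts by 4, averaging the top row
     * alone. */
    static void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
    {
        int x, y, dc = i_dc_left;
        for( x = 0; x < 16; x++ )
            dc += src[x - FDEC_STRIDE];
        dc >>= 5;
        for( y = 0; y < 16; y++, src += FDEC_STRIDE )  /* STORE16x16_SSE2 */
            for( x = 0; x < 16; x++ )
                src[x] = (uint8_t)dc;
    }
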
@@ -407,3 +524,106 @@ cglobal predict_16x16_dc_top_mmxext
     PRED16x16_DC [pw_8 GOT_ecx], 4
     ret

+;-----------------------------------------------------------------------------
+; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_DC_SSE2 2
+    mov         edx, [esp+4]
+    pxor        xmm0, xmm0
+    psadbw      xmm0, [edx - FDEC_STRIDE]
+    movhlps     xmm1, xmm0
+    paddw       xmm0, xmm1
+    paddusw     xmm0, %1
+    psrlw       xmm0, %2        ; dc
+    pshuflw     xmm0, xmm0, 0
+    punpcklqdq  xmm0, xmm0
+    packuswb    xmm0, xmm0      ; dc in bytes
+    STORE16x16_SSE2 xmm0
+%endmacro
+
+cglobal predict_16x16_dc_core_sse2
+    movd        xmm2, [esp+8]
+    PRED16x16_DC_SSE2 xmm2, 5
+    ret
+
+cglobal predict_16x16_dc_top_sse2
+    picgetgot   ecx
+    PRED16x16_DC_SSE2 [pw_8 GOT_ecx], 4
+    ret
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8_ddr_sse2
+    mov         edx, [esp + 8]
+    mov         eax, [esp + 4]
+    picgetgot   ecx
+    movdqu      xmm3, [edx + 8]
+    movdqu      xmm1, [edx + 7]
+    movdqa      xmm2, xmm3
+    psrldq      xmm2, 1
+    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+%assign Y 7
+%rep 3
+    movq        [eax + Y * FDEC_STRIDE], xmm0
+    movq        [eax + (Y-1) * FDEC_STRIDE], xmm1
+    psrldq      xmm0, 2
+    psrldq      xmm1, 2
+%assign Y (Y-2)
+%endrep
+    movq        [eax + 1 * FDEC_STRIDE], xmm0
+    movq        [eax + 0 * FDEC_STRIDE], xmm1
+    ret
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8_ddl_sse2
+    mov         edx, [esp + 8]
+    mov         eax, [esp + 4]
+    picgetgot   ecx
+    movdqa      xmm3, [edx + 16]
+    movdqu      xmm2, [edx + 17]
+    movdqa      xmm1, xmm3
+    pslldq      xmm1, 1
+    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
+%assign Y 0
+%rep 8
+    psrldq      xmm0, 1
+    movq        [eax + Y * FDEC_STRIDE], xmm0
+%assign Y (Y+1)
+%endrep
+    ret
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8_vl_sse2
+    mov         edx, [esp + 8]
+    mov         eax, [esp + 4]
+    picgetgot   ecx
+    movdqa      xmm4, [edx + 16]
+    movdqa      xmm2, xmm4
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm4
+    psrldq      xmm2, 1
+    pslldq      xmm1, 1
+    pavgb       xmm3, xmm2
+    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
+; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
+; xmm3: (t0 + t1 + 1) >> 1
+%assign Y 0
+%rep 3
+    psrldq      xmm0, 1
+    movq        [eax + Y * FDEC_STRIDE], xmm3
+    movq        [eax + (Y+1) * FDEC_STRIDE], xmm0
+    psrldq      xmm3, 1
+%assign Y (Y+2)
+%endrep
+    psrldq      xmm0, 1
+    movq        [eax + Y * FDEC_STRIDE], xmm3
+    movq        [eax + (Y+1) * FDEC_STRIDE], xmm0
+    ret
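A scalar model of predict_8x8_vl_sse2 above (illustrative; it assumes x264's
edge[] convention in which the top neighbours t[0..16] begin at edge[16], as
the [edx + 16] loads suggest):

    #include <stdint.h>

    #define FDEC_STRIDE 32

    /* Vertical-left: even rows are 2-tap averages of adjacent top pixels
     * (the asm's xmm3), odd rows are the 3-tap lowpass (xmm0), and each
     * pair of rows reads one pixel further right, which the asm does with
     * psrldq by 1 per emitted row. */
    static void predict_8x8_vl( uint8_t *src, uint8_t edge[33] )
    {
        const uint8_t *t = edge + 16;
        int x, y;
        for( y = 0; y < 8; y++ )
            for( x = 0; x < 8; x++ )
            {
                int i = x + ( y >> 1 );
                src[y*FDEC_STRIDE + x] = ( y & 1 )
                    ? (uint8_t)( ( t[i] + 2*t[i+1] + t[i+2] + 2 ) >> 2 )
                    : (uint8_t)( ( t[i] + t[i+1] + 1 ) >> 1 );
            }
    }
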
diff --git a/common/i386/predict-c.c b/common/i386/predict-c.c
index 73c7e608..956a4c17 100644
--- a/common/i386/predict-c.c
+++ b/common/i386/predict-c.c
@@ -45,28 +45,33 @@ extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_4x4_ddl_mmxext( uint8_t *src );
 extern void predict_4x4_vl_mmxext( uint8_t *src );
-
-static void predict_16x16_p( uint8_t *src )
-{
-    int a, b, c, i;
-    int H = 0;
-    int V = 0;
-    int i00;
-
-    for( i = 1; i <= 8; i++ )
-    {
-        H += i * ( src[7+i - FDEC_STRIDE ] - src[7-i - FDEC_STRIDE ] );
-        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );
-    }
-
-    a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
-    b = ( 5 * H + 32 ) >> 6;
-    c = ( 5 * V + 32 ) >> 6;
-    i00 = a - b * 7 - c * 7 + 16;
-
-    predict_16x16_p_core_mmxext( src, i00, b, c );
+extern void predict_16x16_dc_top_sse2( uint8_t *src );
+extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
+extern void predict_16x16_v_sse2( uint8_t *src );
+extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+
+#define PREDICT_16x16_P(name)\
+static void predict_16x16_p_##name( uint8_t *src )\
+{\
+    int a, b, c, i;\
+    int H = 0;\
+    int V = 0;\
+    int i00;\
+    for( i = 1; i <= 8; i++ )\
+    {\
+        H += i * ( src[7+i - FDEC_STRIDE ] - src[7-i - FDEC_STRIDE ] );\
+        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );\
+    }\
+    a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
+    b = ( 5 * H + 32 ) >> 6;\
+    c = ( 5 * V + 32 ) >> 6;\
+    i00 = a - b * 7 - c * 7 + 16;\
+    predict_16x16_p_core_##name( src, i00, b, c );\
 }

+PREDICT_16x16_P( mmxext )
+PREDICT_16x16_P( sse2 )
+
 static void predict_8x8c_p( uint8_t *src )
 {
     int a, b, c, i;
@@ -88,20 +93,22 @@ static void predict_8x8c_p( uint8_t *src )
     predict_8x8c_p_core_mmxext( src, i00, b, c );
 }

-static void predict_16x16_dc( uint8_t *src )
-{
-    uint32_t dc=16;
-    int i;
-
-    for( i = 0; i < 16; i+=2 )
-    {
-        dc += src[-1 + i * FDEC_STRIDE];
-        dc += src[-1 + (i+1) * FDEC_STRIDE];
-    }
-
-    predict_16x16_dc_core_mmxext( src, dc );
+#define PREDICT_16x16_DC(name)\
+static void predict_16x16_dc_##name( uint8_t *src )\
+{\
+    uint32_t dc=16;\
+    int i;\
+    for( i = 0; i < 16; i+=2 )\
+    {\
+        dc += src[-1 + i * FDEC_STRIDE];\
+        dc += src[-1 + (i+1) * FDEC_STRIDE];\
+    }\
+    predict_16x16_dc_core_##name( src, dc );\
 }

+PREDICT_16x16_DC( mmxext )
+PREDICT_16x16_DC( sse2 )
+
 static void predict_8x8c_dc( uint8_t *src )
 {
     int s2 = 4
@@ -488,9 +495,9 @@ void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *fenc, uint8_t edge[33], int res[3]
 void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
 {
     pf[I_PRED_16x16_V]      = predict_16x16_v_mmx;
-    pf[I_PRED_16x16_DC]     = predict_16x16_dc;
+    pf[I_PRED_16x16_DC]     = predict_16x16_dc_mmxext;
     pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
-    pf[I_PRED_16x16_P]      = predict_16x16_p;
+    pf[I_PRED_16x16_P]      = predict_16x16_p_mmxext;

 #ifdef ARCH_X86_64
     pf[I_PRED_16x16_H]      = predict_16x16_h;
@@ -526,19 +533,15 @@ void x264_predict_8x8_init_mmxext( x264_predict8x8_t pf[12] )
 void x264_predict_8x8_init_sse2( x264_predict8x8_t pf[12] )
 {
-#ifdef ARCH_X86_64 // x86 not written yet
     pf[I_PRED_8x8_DDL]  = predict_8x8_ddl_sse2;
-    pf[I_PRED_8x8_DDR]  = predict_8x8_ddr_sse2;
     pf[I_PRED_8x8_VL]   = predict_8x8_vl_sse2;
-#endif
+    pf[I_PRED_8x8_DDR]  = predict_8x8_ddr_sse2;
 }

 void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] )
 {
-#ifdef ARCH_X86_64 // x86 not written yet
     pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
     pf[I_PRED_4x4_VL]  = predict_4x4_vl_mmxext;
-#endif

 #ifdef ARCH_X86_64 // slower on x86
     pf[I_PRED_4x4_DDR] = predict_4x4_ddr;
     pf[I_PRED_4x4_VR]  = predict_4x4_vr;
@@ -547,3 +550,10 @@ void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] )
 #endif
 }

+void x264_predict_16x16_init_sse2 ( x264_predict_t pf[7] )
+{
+    pf[I_PRED_16x16_DC]     = predict_16x16_dc_sse2;
+    pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
+    pf[I_PRED_16x16_V]      = predict_16x16_v_sse2;
+    pf[I_PRED_16x16_P]      = predict_16x16_p_sse2;
+}
diff --git a/common/i386/predict.h b/common/i386/predict.h
index f3512afc..49d892d0 100644
--- a/common/i386/predict.h
+++ b/common/i386/predict.h
@@ -29,5 +29,6 @@ void x264_predict_8x8c_init_mmxext ( x264_predict_t pf[7] );
 void x264_predict_4x4_init_mmxext ( x264_predict_t pf[12] );
 void x264_predict_8x8_init_mmxext ( x264_predict8x8_t pf[12] );
 void x264_predict_8x8_init_sse2 ( x264_predict8x8_t pf[12] );
+void x264_predict_16x16_init_sse2 ( x264_predict_t pf[7] );

 #endif
diff --git a/common/predict.c b/common/predict.c
index 1c5b5a97..3b44723f 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -851,6 +851,11 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
     {
         x264_predict_16x16_init_mmxext( pf );
     }
+    // disable on AMD processors since it is slower
+    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+    {
+        x264_predict_16x16_init_sse2( pf );
+    }
 #endif

 #ifdef ARCH_PPC
@@ -899,7 +904,8 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] )
     {
         x264_predict_8x8_init_mmxext( pf );
     }
-    if( cpu&X264_CPU_SSE2 )
+    // disable on AMD processors since it is slower
+    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
     {
         x264_predict_8x8_init_sse2( pf );
     }
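Taken together, the init hooks leave the runtime dispatch looking roughly like
this condensed model (not the literal predict.c code; function names come from
the patch, and the X264_CPU_* flags are x264's real CPU-detection bits):

    /* Later init calls overwrite earlier function pointers, and the 3DNow!
     * flag doubles as a cheap "this is an AMD CPU" test, since the new SSE2
     * paths benchmarked slower there than the MMXEXT ones. */
    void predict_16x16_dispatch( int cpu, x264_predict_t pf[7] )
    {
        /* ... plain C versions assigned first ... */
        if( cpu & X264_CPU_MMXEXT )
            x264_predict_16x16_init_mmxext( pf );
        if( ( cpu & X264_CPU_SSE2 ) && !( cpu & X264_CPU_3DNOW ) )
            x264_predict_16x16_init_sse2( pf );
    }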