From: Loren Merritt Date: Tue, 11 Oct 2011 18:12:43 +0000 (+0000) Subject: Remove obsolete versions of intra_mbcmp_x3 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=50aaf8d84ac6fc78794b98cfe6a25440a09fbb82;p=libx264 Remove obsolete versions of intra_mbcmp_x3 intra_mbcmp_x3 is unnecessary if x9 exists (SSSE3 and onwards). --- diff --git a/common/pixel.c b/common/pixel.c index 601cb9c5..e7b9984f 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -1091,13 +1091,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; - pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3; -#if ARCH_X86_64 - pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3; -#endif pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3; - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) - pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) { INIT2( sad, _cache64_ssse3 ); @@ -1127,7 +1121,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; - pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4; } if( cpu&X264_CPU_AVX ) @@ -1149,17 +1142,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( ssd, _avx ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx; -#if ARCH_X86_64 - pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx; -#endif pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; - pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx; - pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx; } if( cpu&X264_CPU_XOP ) @@ -1175,9 +1163,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( ssd, _xop ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; -#if ARCH_X86_64 - pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_xop; -#endif pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index e4192a1b..2234f979 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -1635,12 +1635,14 @@ cglobal pixel_sa8d_16x16, 4,7 paddw %3, %5 %endmacro +; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+, +; and are only retained for old cpus. %macro INTRA_SA8D_SSE2 0 %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- -cglobal intra_sa8d_x3_8x8, 3,3,16 +cglobal intra_sa8d_x3_8x8, 3,3,14 ; 8x8 hadamard pxor m8, m8 movq m0, [r0+0*FENC_STRIDE] @@ -1667,23 +1669,15 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 paddusw m8, m10 paddusw m9, m11 ABSW2 m10, m11, m6, m7, m6, m7 - ABSW m15, m1, m1 + ABSW m13, m1, m1 paddusw m10, m11 paddusw m8, m9 - paddusw m15, m10 - paddusw m15, m8 + paddusw m13, m10 + paddusw m13, m8 ; 1D hadamard of edges movq m8, [r1+7] movq m9, [r1+16] -%if cpuflag(ssse3) - punpcklwd m8, m8 - pshufb m9, [intrax3_shuf] - pmaddubsw m8, [pb_pppm] - pmaddubsw m9, [pb_pppm] - HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm] - HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm] -%else ; sse2 pxor m10, m10 punpcklbw m8, m10 punpcklbw m9, m10 @@ -1697,7 +1691,6 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 pmullw m11, [pw_pmpmpmpm] paddw m8, m10 paddw m9, m11 -%endif ; differences paddw m10, m8, m9 @@ -1709,8 +1702,8 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 psubw m8, m0 psubw m10, m0 ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum - paddusw m14, m8, m15 - paddusw m15, m10 + paddusw m8, m13 + paddusw m13, m10 punpcklwd m0, m1 punpcklwd m2, m3 punpcklwd m4, m5 @@ -1719,44 +1712,29 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 punpckldq m4, m6 punpcklqdq m0, m4 ; transpose psllw m9, 3 ; top edge - psrldq m2, m15, 2 ; 8x7 sum + psrldq m2, m13, 2 ; 8x7 sum psubw m0, m9 ; 8x1 sum ABSW m0, m0, m9 paddusw m2, m0 ; 3x HADDW -%if cpuflag(xop) - phaddw m2, m14 - vphadduwq m0, m15 - movhlps m1, m0 - vphadduwq m2, m2 ; i8x8_v, i8x8_h - paddd m0, m1 ; i8x8_dc - packusdw m2, m0 ; i8x8_v, i8x8_h, i8x8_dc - pxor m3, m3 - psrlw m2, 1 - pavgw m2, m3 - movq [r2], m2 ; i8x8_v, i8x8_h - psrldq m2, 8 - movd [r2+8], m2 ; i8x8_dc -%else movdqa m7, [pw_1] pmaddwd m2, m7 - pmaddwd m14, m7 - pmaddwd m15, m7 - punpckhdq m3, m2, m14 - punpckldq m2, m14 - pshufd m5, m15, q3311 + pmaddwd m8, m7 + pmaddwd m13, m7 + punpckhdq m3, m2, m8 + punpckldq m2, m8 + pshufd m5, m13, q3311 paddd m2, m3 - paddd m5, m15 - punpckhqdq m3, m2, m5 + paddd m5, m13 + punpckhqdq m0, m2, m5 punpcklqdq m2, m5 - pavgw m3, m2 - pxor m0, m0 - pavgw m3, m0 - movq [r2], m3 ; i8x8_v, i8x8_h - psrldq m3, 8 - movd [r2+8], m3 ; i8x8_dc -%endif + pavgw m0, m2 + pxor m1, m1 + pavgw m0, m1 + movq [r2], m0 ; i8x8_v, i8x8_h + psrldq m0, 8 + movd [r2+8], m0 ; i8x8_dc RET %endif ; ARCH_X86_64 %endmacro ; INTRA_SA8D_SSE2 @@ -3714,7 +3692,6 @@ INTRA8_X9 %undef movdqu ; movups %undef punpcklqdq ; or movlhps %ifndef HIGH_BIT_DEPTH -INTRA_SA8D_SSE2 INIT_MMX ssse3 INTRA_X3_MMX %endif @@ -3734,7 +3711,6 @@ INIT_XMM avx SATDS_SSE2 SA8D %ifndef HIGH_BIT_DEPTH -INTRA_SA8D_SSE2 INTRA_X9 INTRA8_X9 %endif @@ -3745,7 +3721,6 @@ INIT_XMM xop SATDS_SSE2 SA8D %ifndef HIGH_BIT_DEPTH -INTRA_SA8D_SSE2 INTRA_X9 ; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why. %endif diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 1ea02897..b682efe0 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -97,10 +97,7 @@ DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride )) void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * ); -void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * ); -void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * ); -void x264_intra_sad_x3_4x4_avx ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); @@ -113,13 +110,8 @@ void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * ); -void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_xop ( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); -void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); -void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * ); int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index 2453e4c3..d7e76bb4 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -29,19 +29,6 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA - -h4x4_pred_shuf: db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15 -h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 -h8x8_pred_shuf: times 8 db 1 - times 8 db 0 - times 8 db 3 - times 8 db 2 - times 8 db 5 - times 8 db 4 - times 8 db 7 - times 8 db 6 - SECTION .text cextern pb_3 @@ -385,45 +372,6 @@ cglobal intra_sad_x3_4x4_mmx2, 3,3 movd [r2+4], mm1 ;H prediction cost RET -%macro INTRA_SADx3_4x4 0 -cglobal intra_sad_x3_4x4, 3,3 - movd xmm4, [r1+FDEC_STRIDE*0-4] - pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1 - pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2 - pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3 - movd xmm2, [r1-FDEC_STRIDE] - pxor xmm3, xmm3 - pshufb xmm5, xmm4, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH - pshufb xmm4, [h4x4_pred_shuf2] ; EFGH - pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD - punpckldq xmm2, xmm4 ; ABCDEFGH - psadbw xmm2, xmm3 - movd xmm1, [r0+FENC_STRIDE*0] - pinsrd xmm1, [r0+FENC_STRIDE*1], 1 - pinsrd xmm1, [r0+FENC_STRIDE*2], 2 - pinsrd xmm1, [r0+FENC_STRIDE*3], 3 - psadbw xmm0, xmm1 - psadbw xmm5, xmm1 - psraw xmm2, 2 - pavgw xmm2, xmm3 - pshufb xmm2, xmm3 ; DC prediction - punpckhqdq xmm3, xmm0, xmm5 - punpcklqdq xmm0, xmm5 - psadbw xmm2, xmm1 - paddw xmm0, xmm3 - movhlps xmm4, xmm2 - packusdw xmm0, xmm0 - paddw xmm2, xmm4 - movq [r2], xmm0 ; V/H prediction costs - movd [r2+8], xmm2 ; DC prediction cost - RET -%endmacro ; INTRA_SADx3_4x4 - -INIT_XMM sse4 -INTRA_SADx3_4x4 -INIT_XMM avx -INTRA_SADx3_4x4 - ;----------------------------------------------------------------------------- ; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]); ;----------------------------------------------------------------------------- @@ -491,69 +439,6 @@ cglobal intra_sad_x3_8x8_mmx2, 3,3 movd [r2+8], m1 RET -%macro INTRA_SADx3_8x8 0 -cglobal intra_sad_x3_8x8, 3,4,9 -%ifdef PIC - lea r11, [h8x8_pred_shuf] -%define shuf r11 -%else -%define shuf h8x8_pred_shuf -%endif - movq m0, [r1+7] ; left pixels - movq m1, [r1+16] ; top pixels - pxor m2, m2 - pxor m3, m3 - psadbw m2, m0 - psadbw m3, m1 - paddw m2, m3 - pxor m3, m3 ; V score accumulator - psraw m2, 3 - pavgw m2, m3 - punpcklqdq m1, m1 ; V prediction - pshufb m2, m3 ; DC prediction - pxor m4, m4 ; H score accumulator - pxor m5, m5 ; DC score accumulator - mov r3d, 6 -.loop: - movq m6, [r0+FENC_STRIDE*0] - movhps m6, [r0+FENC_STRIDE*1] - pshufb m7, m0, [shuf+r3*8] ; H prediction -%ifdef ARCH_X86_64 - psadbw m7, m6 - psadbw m8, m1, m6 - psadbw m6, m2 - paddw m4, m7 - paddw m3, m8 - paddw m5, m6 -%else - psadbw m7, m6 - paddw m4, m7 - psadbw m7, m1, m6 - psadbw m6, m2 - paddw m3, m7 - paddw m5, m6 -%endif - add r0, FENC_STRIDE*2 - sub r3d, 2 - jge .loop - - movhlps m0, m3 - movhlps m1, m4 - movhlps m2, m5 - paddw m3, m0 - paddw m4, m1 - paddw m5, m2 - movd [r2+0], m3 - movd [r2+4], m4 - movd [r2+8], m5 - RET -%endmacro ; INTRA_SADx3_8x8 - -INIT_XMM ssse3 -INTRA_SADx3_8x8 -INIT_XMM avx -INTRA_SADx3_8x8 - ;----------------------------------------------------------------------------- ; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;-----------------------------------------------------------------------------