From: Janne Grunau Date: Sun, 16 Mar 2014 16:21:58 +0000 (+0100) Subject: arm: x264_coeff_last8_arm X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=435722c9c1870cd54fdb89be39250d492aecb598;p=libx264 arm: x264_coeff_last8_arm checkasm --bench on a cortex-a9: coeff_last8_c: 173 coeff_last8_armv6: 151 60 instead of 73 cycles in ~130k runs on the same cpu while encoding. --- diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c index b9ad2623..08da8e5c 100644 --- a/common/arm/predict-c.c +++ b/common/arm/predict-c.c @@ -27,36 +27,6 @@ #include "predict.h" #include "pixel.h" -void x264_predict_4x4_dc_armv6( uint8_t *src ); -void x264_predict_4x4_dc_top_neon( uint8_t *src ); -void x264_predict_4x4_h_armv6( uint8_t *src ); -void x264_predict_4x4_ddr_armv6( uint8_t *src ); -void x264_predict_4x4_ddl_neon( uint8_t *src ); - -void x264_predict_8x8c_dc_neon( uint8_t *src ); -void x264_predict_8x8c_dc_top_neon( uint8_t *src ); -void x264_predict_8x8c_dc_left_neon( uint8_t *src ); -void x264_predict_8x8c_h_neon( uint8_t *src ); -void x264_predict_8x8c_v_neon( uint8_t *src ); -void x264_predict_8x8c_p_neon( uint8_t *src ); - -void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); - -void x264_predict_16x16_dc_neon( uint8_t *src ); -void x264_predict_16x16_dc_top_neon( uint8_t *src ); -void x264_predict_16x16_dc_left_neon( uint8_t *src ); -void x264_predict_16x16_h_neon( uint8_t *src ); -void x264_predict_16x16_v_neon( 
uint8_t *src ); -void x264_predict_16x16_p_neon( uint8_t *src ); - void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ) { if (!(cpu&X264_CPU_ARMV6)) diff --git a/common/arm/predict.h b/common/arm/predict.h index 26e1e93e..7c7acfc1 100644 --- a/common/arm/predict.h +++ b/common/arm/predict.h @@ -27,17 +27,35 @@ #define X264_ARM_PREDICT_H void x264_predict_4x4_dc_armv6( uint8_t *src ); +void x264_predict_4x4_dc_top_neon( uint8_t *src ); void x264_predict_4x4_v_armv6( uint8_t *src ); void x264_predict_4x4_h_armv6( uint8_t *src ); -void x264_predict_8x8_v_neon( pixel *src, pixel edge[36] ); -void x264_predict_8x8_h_neon( pixel *src, pixel edge[36] ); -void x264_predict_8x8_dc_neon( pixel *src, pixel edge[36] ); -void x264_predict_8x8c_dc_neon( pixel *src ); -void x264_predict_8x8c_h_neon( pixel *src ); -void x264_predict_8x8c_v_neon( pixel *src ); -void x264_predict_16x16_v_neon( pixel *src ); -void x264_predict_16x16_h_neon( pixel *src ); -void x264_predict_16x16_dc_neon( pixel *src ); +void x264_predict_4x4_ddr_armv6( uint8_t *src ); +void x264_predict_4x4_ddl_neon( uint8_t *src ); + +void x264_predict_8x8c_dc_neon( uint8_t *src ); +void x264_predict_8x8c_dc_top_neon( uint8_t *src ); +void x264_predict_8x8c_dc_left_neon( uint8_t *src ); +void x264_predict_8x8c_h_neon( uint8_t *src ); +void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_8x8c_p_neon( uint8_t *src ); + +void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t 
edge[36] ); + +void x264_predict_16x16_dc_neon( uint8_t *src ); +void x264_predict_16x16_dc_top_neon( uint8_t *src ); +void x264_predict_16x16_dc_left_neon( uint8_t *src ); +void x264_predict_16x16_h_neon( uint8_t *src ); +void x264_predict_16x16_v_neon( uint8_t *src ); +void x264_predict_16x16_p_neon( uint8_t *src ); void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S index 2aeedc4a..b8c6ba33 100644 --- a/common/arm/quant-a.S +++ b/common/arm/quant-a.S @@ -321,6 +321,20 @@ function x264_coeff_last4_arm bx lr .endfunc +function x264_coeff_last8_arm + ldrd r2, r3, [r0, #8] + orrs ip, r2, r3 + movne r0, #4 + ldrdeq r2, r3, [r0] + moveq r0, #0 + tst r3, r3 + addne r0, #2 + movne r2, r3 + lsrs r2, r2, #16 + addne r0, r0, #1 + bx lr +.endfunc + .macro COEFF_LAST_1x size function x264_coeff_last\size\()_neon .if \size == 15 diff --git a/common/arm/quant.h b/common/arm/quant.h index 0695ab1e..75d9fb28 100644 --- a/common/arm/quant.h +++ b/common/arm/quant.h @@ -39,6 +39,7 @@ void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); int x264_coeff_last4_arm( int16_t * ); +int x264_coeff_last8_arm( int16_t * ); int x264_coeff_last15_neon( int16_t * ); int x264_coeff_last16_neon( int16_t * ); int x264_coeff_last64_neon( int16_t * ); diff --git a/common/quant.c b/common/quant.c index 339df1c1..1a9e4dca 100644 --- a/common/quant.c +++ b/common/quant.c @@ -725,7 +725,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) + { pf->coeff_last4 = x264_coeff_last4_arm; + pf->coeff_last8 = x264_coeff_last8_arm; + } if( cpu&X264_CPU_NEON ) {