From: Janne Grunau Date: Wed, 12 Mar 2014 23:05:48 +0000 (+0100) Subject: arm: use available neon functions for intra_sa8d/sad/satd_x3 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=00a00ccab316de3d50da6a82ba4af44dcb4655ec;p=libx264 arm: use available neon functions for intra_sa8d/sad/satd_x3 4% faster on main/medium, 15% faster on baseline/superfast on a cortex-a9. --- diff --git a/common/arm/predict.h b/common/arm/predict.h index 0d07c2a2..6cf2f5f3 100644 --- a/common/arm/predict.h +++ b/common/arm/predict.h @@ -26,6 +26,16 @@ #ifndef X264_ARM_PREDICT_H #define X264_ARM_PREDICT_H +void x264_predict_8x8_v_neon( pixel *src, pixel edge[36] ); +void x264_predict_8x8_h_neon( pixel *src, pixel edge[36] ); +void x264_predict_8x8_dc_neon( pixel *src, pixel edge[36] ); +void x264_predict_8x8c_dc_neon( pixel *src ); +void x264_predict_8x8c_h_neon( pixel *src ); +void x264_predict_8x8c_v_neon( pixel *src ); +void x264_predict_16x16_v_neon( pixel *src ); +void x264_predict_16x16_h_neon( pixel *src ); +void x264_predict_16x16_dc_neon( pixel *src ); + void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] ); diff --git a/common/pixel.c b/common/pixel.c index f62e2b38..fd1092a2 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -36,6 +36,7 @@ #endif #if ARCH_ARM # include "arm/pixel.h" +# include "arm/predict.h" #endif #if ARCH_UltraSPARC # include "sparc/pixel.h" @@ -532,6 +533,10 @@ INTRA_MBCMP_8x8(sa8d,, _c ) INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif +#if !HIGH_BIT_DEPTH && HAVE_ARMV6 +INTRA_MBCMP_8x8( sad, _neon, _neon ) +INTRA_MBCMP_8x8(sa8d, _neon, _neon ) +#endif #define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\ void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\ @@ -587,6 +592,16 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 ) #endif #endif +#if !HIGH_BIT_DEPTH && HAVE_ARMV6 +INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c ) +INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c ) +INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) +INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) +#endif // No C implementation of intra_satd_x9. See checkasm for its behavior, // or see x264_mb_analyse_intra for the entirely different algorithm we @@ -1352,6 +1367,17 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_neon; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_neon; + pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_neon; + pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon; + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon;