From: Janne Grunau
Date: Tue, 12 Aug 2014 15:26:10 +0000 (+0200)
Subject: aarch64: NEON asm for decimate_score
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=40d5db342b7f5198db9826a51f31e454bd208596;p=libx264

aarch64: NEON asm for decimate_score

decimate_score15 and 16 are 60% faster, decimate_score64 is 4 times
faster than C.
---

diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 02b71b2a..ed9b3ca8 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2014 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -300,6 +301,118 @@ dequant_4x4_dc_rshift:
     ret
 endfunc
 
+.macro decimate_score_1x size
+function x264_decimate_score\size\()_neon, export=1 // w0 = 0 if every coefficient is zero, 9 if any |coef| > 1, else a sum of x264_decimate_table4 entries
+    ld1         {v0.8h,v1.8h}, [x0]                 // load 16 halfword coefficients from x0 (also for size 15)
+    movrel      x5,  X(x264_decimate_table4)        // x5 = base of the byte table indexed in the loop
+    movi        v3.16b, #0x01                       // every byte lane = 1, threshold for the cmhi below
+    sqxtn       v0.8b,  v0.8h                       // saturating narrow to bytes: coefficients 0-7
+    sqxtn2      v0.16b, v1.8h                       // ... and coefficients 8-15 into the upper half
+    abs         v2.16b, v0.16b
+    cmeq        v1.16b, v0.16b, #0                  // 0xff in every byte lane whose coefficient is zero
+    cmhi        v2.16b, v2.16b, v3.16b              // 0xff in every byte lane where |coef| > 1 (unsigned compare)
+    shrn        v1.8b,  v1.8h,  #4                  // squeeze each byte lane to 4 bits: 64-bit mask, one nibble per coefficient
+    shrn        v2.8b,  v2.8h,  #4                  // same packing for the |coef| > 1 mask
+    fmov        x2,  d2
+    fmov        x1,  d1
+    cbnz        x2,  9f                             // some |coef| > 1: return 9
+    mvn         x1,  x1                             // invert: nibble is now set where the coefficient is nonzero
+    mov         w0,  #0                             // score accumulator
+    cbz         x1,  0f                             // no nonzero coefficient: return 0
+.ifc \size, 15
+    lsr         x1,  x1,  #1                        // size 15 only. NOTE(review): looks like this shortens the first run by one (coefficient 0 not counted) - confirm against the C reference
+.endif
+    rbit        x1,  x1                             // bit-reverse so coefficient 0 ends up in the most significant bits for clz
+1:
+    clz         x3,  x1                             // zero bits in front of the next nonzero nibble
+    lsr         x6,  x3,  #2                        // 4 bits per coefficient: table index = clz / 4
+    lsl         x1,  x1,  x3                        // bring that nibble to the top of x1
+    ldrb        w7,  [x5, x6]                       // w7 = table4[index]
+    cbz         x1,  2f                             // NOTE(review): x1 has its top bit set after the shift above, so this branch looks unreachable - confirm
+    lsl         x1,  x1,  #4                        // drop the nibble that was just scored
+    add         w0,  w0,  w7
+    cbnz        x1,  1b                             // loop while nonzero coefficients remain
+    ret
+2:
+    add         w0,  w0,  w7
+0:
+    ret
+9:
+    mov         w0,  #9
+    ret
+endfunc
+.endm
+
+decimate_score_1x 15
+decimate_score_1x 16
+
+const mask64, align=6
+    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 // one distinct bit per byte lane; ANDed with the compare results in x264_decimate_score64_neon
+    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
+endconst
+
+function x264_decimate_score64_neon, export=1       // w0 = 0 if all 64 coefficients are zero, 9 if any |coef| > 1, else a sum of x264_decimate_table8 entries
+    ld1         {v0.8h,v1.8h}, [x0], #32            // four post-incremented loads: 64 halfword coefficients in v0-v7
+    ld1         {v2.8h,v3.8h}, [x0], #32
+    ld1         {v4.8h,v5.8h}, [x0], #32
+    ld1         {v6.8h,v7.8h}, [x0]
+    movrel      x6,  mask64
+    movi        v31.16b, #0x01                      // every byte lane = 1, threshold for the cmhi below
+    sqxtn       v16.8b,  v1.8h                      // saturating narrow; the odd register goes to the low half, so each group of 8 is swapped within its pair
+    sqxtn2      v16.16b, v0.8h
+    sqxtn       v17.8b,  v3.8h
+    sqxtn2      v17.16b, v2.8h
+    sqxtn       v18.8b,  v5.8h
+    sqxtn2      v18.16b, v4.8h
+    sqxtn       v19.8b,  v7.8h
+    sqxtn2      v19.16b, v6.8h
+    abs         v4.16b, v16.16b
+    abs         v5.16b, v17.16b
+    abs         v6.16b, v18.16b
+    abs         v7.16b, v19.16b
+    ld1         {v30.16b}, [x6]                     // v30 = mask64 bit weights
+    cmeq        v0.16b, v16.16b, #0                 // 0xff in every byte lane whose coefficient is zero
+    cmeq        v1.16b, v17.16b, #0
+    cmeq        v2.16b, v18.16b, #0
+    cmeq        v3.16b, v19.16b, #0
+    umax        v4.16b, v4.16b, v5.16b              // fold the four |coef| vectors into one lane-wise maximum (finished two umax below)
+    umax        v6.16b, v6.16b, v7.16b
+    and         v0.16b, v0.16b, v30.16b             // keep one bit per zero coefficient
+    and         v1.16b, v1.16b, v30.16b
+    and         v2.16b, v2.16b, v30.16b
+    and         v3.16b, v3.16b, v30.16b
+    umax        v4.16b, v4.16b, v6.16b
+    addp        v0.16b, v1.16b, v0.16b              // three rounds of pairwise adds merge 8 lanes into each byte
+    addp        v2.16b, v3.16b, v2.16b
+    cmhi        v4.16b, v4.16b, v31.16b             // 0xff in lanes where the maximum |coef| exceeds 1
+    addp        v0.16b, v2.16b, v0.16b
+    shrn        v4.8b,  v4.8h,  #4                  // pack the |coef| > 1 mask into 64 bits
+    addp        v0.16b, v0.16b, v0.16b              // d0 = 64-bit mask, one bit per coefficient, set where it is zero
+    fmov        x2,  d4
+    fmov        x1,  d0
+    cbnz        x2,  9f                             // some |coef| > 1: return 9
+    mvn         x1,  x1                             // invert: bit is now set where the coefficient is nonzero
+    mov         w0,  #0                             // score accumulator
+    cbz         x1,  0f                             // no nonzero coefficient: return 0
+    movrel      x5,  X(x264_decimate_table8)
+1:
+    clz         x3,  x1                             // zero bits in front of the next set bit
+    lsl         x1,  x1,  x3                        // bring that bit to the top of x1
+    ldrb        w7,  [x5, x3]                       // w7 = table8[clz]
+    cbz         x1,  2f                             // NOTE(review): x1 has its top bit set after the shift above, so this branch looks unreachable - confirm
+    lsl         x1,  x1,  #1                        // drop the bit that was just scored
+    add         w0,  w0,  w7
+    cbnz        x1,  1b                             // loop while nonzero coefficients remain
+    ret
+2:
+    add         w0,  w0,  w7
+0:
+    ret
+9:
+    mov         w0,  #9
+    ret
+endfunc
+
 // int coeff_last( int16_t *l )
 function x264_coeff_last4_aarch64, export=1
     ldr         x2, [x0]
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index dfcac255..5a797c1a 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -4,6 +4,7 @@
  * Copyright (C) 2005-2014 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -38,6 +39,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
 void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 
+int x264_decimate_score15_neon( int16_t * ); // defined in quant-a.S via the decimate_score_1x macro
+int x264_decimate_score16_neon( int16_t * ); // defined in quant-a.S via the decimate_score_1x macro
+int x264_decimate_score64_neon( int16_t * ); // defined in quant-a.S
+
 int x264_coeff_last4_aarch64( int16_t * );
 int x264_coeff_last8_aarch64( int16_t * );
 int x264_coeff_last15_neon( int16_t * );
diff --git a/common/quant.c b/common/quant.c
index 31d8901d..d1b89c08 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -714,7 +714,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #endif // HAVE_MMX
 
 #if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC ) {
+    if( cpu&X264_CPU_ALTIVEC )
+    {
         pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
         pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
         pf->quant_4x4 = x264_quant_4x4_altivec;
@@ -754,6 +755,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last4 = x264_coeff_last4_aarch64;
         pf->coeff_last8 = x264_coeff_last8_aarch64;
     }
+    if( cpu&X264_CPU_NEON ) // point the decimate_score hooks at the NEON routines when the NEON flag is set
+    {
+        pf->decimate_score15 = x264_decimate_score15_neon;
+        pf->decimate_score16 = x264_decimate_score16_neon;
+        pf->decimate_score64 = x264_decimate_score64_neon;
+    }
 #endif
 #endif // HIGH_BIT_DEPTH
     pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] =