From: Janne Grunau
Date: Tue, 29 Jul 2014 10:06:24 +0000 (+0100)
Subject: aarch64: NEON asm for missing x264_zigzag_* functions
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6c1632493e5afac8be1e1693377dab27f4704a1d;p=libx264

aarch64: NEON asm for missing x264_zigzag_* functions

zigzag_scan_4x4_field_neon, zigzag_sub_4x4_field_neon,
zigzag_sub_4x4ac_field_neon, zigzag_sub_4x4_frame_neon,
zigzag_sub_4x4ac_frame_neon more than 2 times faster

zigzag_scan_8x8_frame_neon, zigzag_scan_8x8_field_neon,
zigzag_sub_8x8_field_neon, zigzag_sub_8x8_frame_neon 4-5 times faster

zigzag_interleave_8x8_cavlc_neon 6 times faster
---

diff --git a/common/aarch64/dct-a.S b/common/aarch64/dct-a.S
index 7b54fbd0..aa12118e 100644
--- a/common/aarch64/dct-a.S
+++ b/common/aarch64/dct-a.S
@@ -1,9 +1,10 @@
 /****************************************************************************
- * dct-a.S: AArch6464 transform and zigzag
+ * dct-a.S: aarch64 transform and zigzag
  *****************************************************************************
  * Copyright (C) 2009-2014 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -32,6 +33,25 @@ const scan4x4_frame, align=4
 .byte   26,27, 28,29, 22,23, 30,31
 endconst
 
+const scan4x4_field, align=4                // byte-pair tbl indices: halfword order 0,1,4,2,3,5,6,7
+.byte    0,1,   2,3,   8,9,   4,5
+.byte    6,7,   10,11, 12,13, 14,15
+endconst
+
+const sub4x4_frame, align=4                 // byte tbl indices into a 4x4 pixel block, frame zigzag order
+.byte    0,  1,  4,  8
+.byte    5,  2,  3,  6
+.byte    9, 12, 13, 10
+.byte    7, 11, 14, 15
+endconst
+
+const sub4x4_field, align=4                 // byte tbl indices into a 4x4 pixel block, field zigzag order
+.byte    0,  4,  1,  8
+.byte   12,  5,  9, 13
+.byte    2,  6, 10, 14
+.byte    3,  7, 11, 15
+endconst
+
 // sum = a + (b>>shift)   sub = (a>>shift) - b
 .macro SUMSUB_SHR shift sum sub a b t0 t1
     sshr        \t0,  \b, #\shift
@@ -655,6 +675,35 @@ function x264_sub8x8_dct_dc_neon, export=1
     ret
 endfunc
 
+function x264_zigzag_interleave_8x8_cavlc_neon, export=1   // x0 = dst, x1 = src, x2 = nnz
+    mov         x3,  #7                     // post-increment that moves nnz+1 to nnz+8
+    movi        v31.4s, #1                  // compare threshold and 0/1 mask
+    ld4        {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64   // de-interleave src[0..31] by index mod 4
+    ld4        {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64   // de-interleave src[32..63] by index mod 4
+    umax        v16.8h, v0.8h,  v4.8h       // unsigned max is non-zero iff any input is non-zero
+    umax        v17.8h, v1.8h,  v5.8h
+    umax        v18.8h, v2.8h,  v6.8h
+    umax        v19.8h, v3.8h,  v7.8h
+    st1        {v0.8h}, [x0],  #16          // dst[ 0..15] = v0, v4
+    st1        {v4.8h}, [x0],  #16
+    umaxp       v16.8h, v16.8h, v17.8h
+    umaxp       v18.8h, v18.8h, v19.8h
+    st1        {v1.8h}, [x0],  #16          // dst[16..31] = v1, v5
+    st1        {v5.8h}, [x0],  #16
+    umaxp       v16.8h, v16.8h, v18.8h      // each 32-bit lane now holds two maxima of one 16-coef group
+    st1        {v2.8h}, [x0],  #16          // dst[32..47] = v2, v6
+    st1        {v6.8h}, [x0],  #16
+    cmhs        v16.4s, v16.4s, v31.4s      // lane >= 1, i.e. non-zero ("> 1" would miss a lane equal to 1)
+    st1        {v3.8h}, [x0],  #16          // dst[48..63] = v3, v7
+    and         v16.16b, v16.16b, v31.16b   // all-ones mask -> 1
+    st1        {v7.8h}, [x0],  #16
+    st1        {v16.b}[0],  [x2],  #1       // nnz[0]
+    st1        {v16.b}[4],  [x2],  x3       // nnz[1]
+    st1        {v16.b}[8],  [x2],  #1       // nnz[8]
+    st1        {v16.b}[12], [x2]            // nnz[9]
+    ret
+endfunc
+
 function x264_zigzag_scan_4x4_frame_neon, export=1
     movrel      x2, scan4x4_frame
     ld1        {v0.16b,v1.16b}, [x1]
@@ -664,3 +713,282 @@ function x264_zigzag_scan_4x4_frame_neon, export=1
     st1        {v2.16b,v3.16b}, [x0]
     ret
 endfunc
+
+.macro zigzag_sub_4x4 f ac                  // f: frame|field table, ac: empty or "ac"
+function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1   // x0 = level, x1 = p_src, x2 = p_dst, x3 = dc (ac only)
+    mov         x9,  #FENC_STRIDE
+    mov         x4,  #FDEC_STRIDE
+    movrel      x5,  sub4x4_\f
+    mov         x6,  x2                     // keep p_dst; x2 is advanced by the loads below
+    ld1        {v0.s}[0], [x1], x9          // v0 = 4 rows x 4 bytes of p_src
+    ld1        {v0.s}[1], [x1], x9
+    ld1        {v0.s}[2], [x1], x9
+    ld1        {v0.s}[3], [x1], x9
+    ld1        {v16.16b}, [x5]              // zigzag byte permutation
+    ld1        {v1.s}[0], [x2], x4          // v1 = 4 rows x 4 bytes of p_dst
+    ld1        {v1.s}[1], [x2], x4
+    ld1        {v1.s}[2], [x2], x4
+    ld1        {v1.s}[3], [x2], x4
+    tbl         v2.16b, {v0.16b}, v16.16b   // src bytes in zigzag order
+    tbl         v3.16b, {v1.16b}, v16.16b   // dst bytes in zigzag order
+    st1        {v0.s}[0], [x6], x4          // copy src rows over dst
+    usubl       v4.8h,  v2.8b,  v3.8b       // level[0..7]  = src - dst, widened to 16 bit
+.ifc \ac, ac
+    dup         h7,  v4.h[0]                // save level[0]
+    ins         v4.h[0], wzr                // level[0] = 0
+    fmov        w5,  s7
+    strh        w5,  [x3]                   // *dc = saved level[0]
+.endif
+    usubl2      v5.8h,  v2.16b, v3.16b      // level[8..15] = src - dst, widened to 16 bit
+    st1        {v0.s}[1], [x6], x4
+    umax        v6.8h,  v4.8h,  v5.8h
+    umaxv       h6,  v6.8h                  // unsigned max over all levels: 0 iff all are zero
+    st1        {v0.s}[2], [x6], x4
+    fmov        w7,  s6
+    st1        {v0.s}[3], [x6], x4
+    cmp         w7,  #0
+    st1        {v4.8h,v5.8h}, [x0]
+    cset        w0,  ne                     // return 1 if any stored level is non-zero
+    ret
+endfunc
+.endm
+
+zigzag_sub_4x4 field
+zigzag_sub_4x4 field, ac
+zigzag_sub_4x4 frame
+zigzag_sub_4x4 frame, ac
+
+function x264_zigzag_scan_4x4_field_neon, export=1   // x0 = level, x1 = dct
+    movrel      x2,  scan4x4_field
+    ld1        {v0.8h,v1.8h},   [x1]
+    ld1        {v16.16b},       [x2]
+    tbl         v0.16b, {v0.16b}, v16.16b   // only the first 8 coefficients are permuted
+    st1        {v0.8h,v1.8h},   [x0]        // v1 (coefficients 8..15) is stored unchanged
+    ret
+endfunc
+
+function x264_zigzag_scan_8x8_frame_neon, export=1   // x0 = level, x1 = dct
+    movrel      x2,  scan8x8_frame
+    ld1        {v0.8h,v1.8h},   [x1], #32   // v0..v7 = the 64 input coefficients, 8 per register
+    ld1        {v2.8h,v3.8h},   [x1], #32
+    ld1        {v4.8h,v5.8h},   [x1], #32
+    ld1        {v6.8h,v7.8h},   [x1]
+    ld1        {v16.16b,v17.16b}, [x2], #32 // v16..v23 = tbl index vectors
+    ld1        {v18.16b,v19.16b}, [x2], #32
+    ld1        {v20.16b,v21.16b}, [x2], #32
+    ld1        {v22.16b,v23.16b}, [x2], #32
+    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
+    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
+    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
+    tbl         v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
+    tbl         v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
+    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
+    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
+    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
+    mov         v25.h[6], v4.h[0]           // patch lanes whose source is outside the 4-register tbl window
+    mov         v25.h[7], v5.h[0]
+    mov         v26.h[0], v4.h[1]
+    mov         v27.h[4], v7.h[0]
+    mov         v28.h[7], v4.h[4]
+    mov         v29.h[7], v3.h[6]
+    mov         v30.h[0], v2.h[7]
+    mov         v30.h[1], v3.h[7]
+    st1        {v24.8h,v25.8h}, [x0], #32
+    st1        {v26.8h,v27.8h}, [x0], #32
+    st1        {v28.8h,v29.8h}, [x0], #32
+    st1        {v30.8h,v31.8h}, [x0]
+    ret
+endfunc
+
+#define Z(z)   2*(z), 2*(z)+1               // the two byte indices of halfword z
+#define T(x,y) Z(x*8+y)                     // T is rebased below to match each tbl's first table register
+const scan8x8_frame, align=5
+    .byte T(0,0), T(1,0), T(0,1), T(0,2)
+    .byte T(1,1), T(2,0), T(3,0), T(2,1)
+    .byte T(1,2), T(0,3), T(0,4), T(1,3)
+    .byte T(2,2), T(3,1), T(4,0), T(5,0)
+    .byte T(4,1), T(3,2), T(2,3), T(1,4)
+    .byte T(0,5), T(0,6), T(1,5), T(2,4)
+#undef T
+#define T(x,y) Z((x-3)*8+y)
+    .byte T(3,3), T(4,2), T(5,1), T(6,0)
+    .byte T(7,0), T(6,1), T(5,2), T(4,3)
+#undef T
+#define T(x,y) Z((x-0)*8+y)
+    .byte T(3,4), T(2,5), T(1,6), T(0,7)
+    .byte T(1,7), T(2,6), T(3,5), T(4,4)
+#undef T
+#define T(x,y) Z((x-4)*8+y)
+    .byte T(5,3), T(6,2), T(7,1), T(7,2)
+    .byte T(6,3), T(5,4), T(4,5), T(3,6)
+    .byte T(2,7), T(3,7), T(4,6), T(5,5)
+    .byte T(6,4), T(7,3), T(7,4), T(6,5)
+    .byte T(5,6), T(4,7), T(5,7), T(6,6)
+    .byte T(7,5), T(7,6), T(6,7), T(7,7)
+endconst
+
+function x264_zigzag_scan_8x8_field_neon, export=1   // x0 = level, x1 = dct
+    movrel      x2,  scan8x8_field
+    ld1        {v0.8h,v1.8h},   [x1], #32   // v0..v7 = the 64 input coefficients, 8 per register
+    ld1        {v2.8h,v3.8h},   [x1], #32
+    ld1        {v4.8h,v5.8h},   [x1], #32
+    ld1        {v6.8h,v7.8h},   [x1]
+    ld1        {v16.16b,v17.16b}, [x2], #32 // v16..v22 = tbl index vectors (7 x 16 bytes)
+    ld1        {v18.16b,v19.16b}, [x2], #32
+    ld1        {v20.16b,v21.16b}, [x2], #32
+    ld1        {v22.16b},       [x2]
+    ext         v31.16b, v7.16b, v7.16b, #4 // v31 = v7.h[2..7], v7.h[0..1]
+    tbl         v24.16b, {v0.16b,v1.16b}, v16.16b
+    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
+    tbl         v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
+    tbl         v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
+    tbl         v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
+    tbl         v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
+    tbl         v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
+    ext         v31.16b, v6.16b, v31.16b, #12   // v31 = v6.h[6..7], v7.h[2..7]: the last 8 outputs
+    st1        {v24.8h,v25.8h}, [x0], #32
+    st1        {v26.8h,v27.8h}, [x0], #32
+    st1        {v28.8h,v29.8h}, [x0], #32
+    st1        {v30.8h,v31.8h}, [x0]
+    ret
+endfunc
+
+.macro zigzag_sub8x8 f                      // f: frame|field table
+function x264_zigzag_sub_8x8_\f\()_neon, export=1   // x0 = level, x1 = p_src, x2 = p_dst
+    movrel      x4,  sub8x8_\f
+    mov         x5,  #FENC_STRIDE
+    mov         x6,  #FDEC_STRIDE
+    mov         x7,  x2                     // keep p_dst; x2 is advanced by the loads below
+    ld1        {v0.d}[0], [x1], x5          // v0..v3 = 8 rows x 8 bytes of p_src
+    ld1        {v0.d}[1], [x1], x5
+    ld1        {v1.d}[0], [x1], x5
+    ld1        {v1.d}[1], [x1], x5
+    ld1        {v2.d}[0], [x1], x5
+    ld1        {v2.d}[1], [x1], x5
+    ld1        {v3.d}[0], [x1], x5
+    ld1        {v3.d}[1], [x1]
+    ld1        {v4.d}[0], [x2], x6          // v4..v7 = 8 rows x 8 bytes of p_dst
+    ld1        {v4.d}[1], [x2], x6
+    ld1        {v5.d}[0], [x2], x6
+    ld1        {v5.d}[1], [x2], x6
+    ld1        {v6.d}[0], [x2], x6
+    ld1        {v6.d}[1], [x2], x6
+    ld1        {v7.d}[0], [x2], x6
+    ld1        {v7.d}[1], [x2]
+    ld1        {v16.16b,v17.16b}, [x4], #32 // v16..v19 = 64-byte zigzag permutation
+    ld1        {v18.16b,v19.16b}, [x4], #32
+    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b   // src bytes in zigzag order
+    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
+    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
+    tbl         v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
+    tbl         v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b   // dst bytes in zigzag order
+    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
+    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
+    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
+    usubl       v4.8h,  v24.8b,  v28.8b     // level = src - dst, widened to 16 bit
+    usubl2      v5.8h,  v24.16b, v28.16b
+    usubl       v6.8h,  v25.8b,  v29.8b
+    usubl2      v7.8h,  v25.16b, v29.16b
+    usubl       v16.8h, v26.8b,  v30.8b
+    usubl2      v17.8h, v26.16b, v30.16b
+    usubl       v18.8h, v27.8b,  v31.8b
+    usubl2      v19.8h, v27.16b, v31.16b
+    umax        v20.8h, v4.8h,   v5.8h      // unsigned max tree: result is 0 iff all levels are zero
+    umax        v21.8h, v6.8h,   v7.8h
+    umax        v22.8h, v16.8h,  v17.8h
+    umax        v23.8h, v18.8h,  v19.8h
+    umax        v20.8h, v20.8h,  v21.8h
+    umax        v21.8h, v22.8h,  v23.8h
+    umax        v20.8h, v20.8h,  v21.8h
+    umaxv       h22, v20.8h
+    st1        {v0.d}[0], [x7], x6          // copy src rows over dst
+    st1        {v0.d}[1], [x7], x6
+    st1        {v1.d}[0], [x7], x6
+    st1        {v1.d}[1], [x7], x6
+    st1        {v2.d}[0], [x7], x6
+    st1        {v2.d}[1], [x7], x6
+    st1        {v3.d}[0], [x7], x6
+    st1        {v3.d}[1], [x7]
+    st1        {v4.8h,v5.8h},   [x0], #32   // 4 x 32 bytes = 64 levels
+    st1        {v6.8h,v7.8h},   [x0], #32
+    st1        {v16.8h,v17.8h}, [x0], #32
+    st1        {v18.8h,v19.8h}, [x0]
+    fmov        w9,  s22
+    cmp         w9,  #0
+    cset        w0,  ne                     // return 1 if any level is non-zero
+    ret
+endfunc
+.endm
+
+zigzag_sub8x8 field
+zigzag_sub8x8 frame
+
+#undef T
+#define T(x,y) Z(x*8+y)                     // T is rebased below to match each tbl's first table register
+const scan8x8_field, align=5
+    .byte T(0,0), T(0,1), T(0,2), T(1,0)
+    .byte T(1,1), T(0,3), T(0,4), T(1,2)
+    .byte T(2,0), T(1,3), T(0,5), T(0,6)
+    .byte T(0,7), T(1,4), T(2,1), T(3,0)
+#undef T
+#define T(x,y) Z((x-1)*8+y)
+    .byte T(2,2), T(1,5), T(1,6), T(1,7)
+    .byte T(2,3), T(3,1), T(4,0), T(3,2)
+#undef T
+#define T(x,y) Z((x-2)*8+y)
+    .byte T(2,4), T(2,5), T(2,6), T(2,7)
+    .byte T(3,3), T(4,1), T(5,0), T(4,2)
+#undef T
+#define T(x,y) Z((x-3)*8+y)
+    .byte T(3,4), T(3,5), T(3,6), T(3,7)
+    .byte T(4,3), T(5,1), T(6,0), T(5,2)
+#undef T
+#define T(x,y) Z((x-4)*8+y)
+    .byte T(4,4), T(4,5), T(4,6), T(4,7)
+    .byte T(5,3), T(6,1), T(6,2), T(5,4)
+#undef T
+#define T(x,y) Z((x-5)*8+y)
+    .byte T(5,5), T(5,6), T(5,7), T(6,3)
+    .byte T(7,0), T(7,1), T(6,4), T(6,5)
+endconst
+
+
+#undef T
+#define T(y,x) x*8+y                        // byte index into the 8x8 pixel block; note the swapped argument order
+const sub8x8_frame, align=5
+    .byte T(0,0), T(1,0), T(0,1), T(0,2)
+    .byte T(1,1), T(2,0), T(3,0), T(2,1)
+    .byte T(1,2), T(0,3), T(0,4), T(1,3)
+    .byte T(2,2), T(3,1), T(4,0), T(5,0)
+    .byte T(4,1), T(3,2), T(2,3), T(1,4)
+    .byte T(0,5), T(0,6), T(1,5), T(2,4)
+    .byte T(3,3), T(4,2), T(5,1), T(6,0)
+    .byte T(7,0), T(6,1), T(5,2), T(4,3)
+    .byte T(3,4), T(2,5), T(1,6), T(0,7)
+    .byte T(1,7), T(2,6), T(3,5), T(4,4)
+    .byte T(5,3), T(6,2), T(7,1), T(7,2)
+    .byte T(6,3), T(5,4), T(4,5), T(3,6)
+    .byte T(2,7), T(3,7), T(4,6), T(5,5)
+    .byte T(6,4), T(7,3), T(7,4), T(6,5)
+    .byte T(5,6), T(4,7), T(5,7), T(6,6)
+    .byte T(7,5), T(7,6), T(6,7), T(7,7)
+endconst
+
+const sub8x8_field, align=5
+    .byte T(0,0), T(0,1), T(0,2), T(1,0)
+    .byte T(1,1), T(0,3), T(0,4), T(1,2)
+    .byte T(2,0), T(1,3), T(0,5), T(0,6)
+    .byte T(0,7), T(1,4), T(2,1), T(3,0)
+    .byte T(2,2), T(1,5), T(1,6), T(1,7)
+    .byte T(2,3), T(3,1), T(4,0), T(3,2)
+    .byte T(2,4), T(2,5), T(2,6), T(2,7)
+    .byte T(3,3), T(4,1), T(5,0), T(4,2)
+    .byte T(3,4), T(3,5), T(3,6), T(3,7)
+    .byte T(4,3), T(5,1), T(6,0), T(5,2)
+    .byte T(4,4), T(4,5), T(4,6), T(4,7)
+    .byte T(5,3), T(6,1), T(6,2), T(5,4)
+    .byte T(5,5), T(5,6), T(5,7), T(6,3)
+    .byte T(7,0), T(7,1), T(6,4), T(6,5)
+    .byte T(6,6), T(6,7), T(7,2), T(7,3)
+    .byte T(7,4), T(7,5), T(7,6), T(7,7)
+endconst
diff --git a/common/aarch64/dct.h b/common/aarch64/dct.h
index 54c48b31..4af311c6 100644
--- a/common/aarch64/dct.h
+++ b/common/aarch64/dct.h
@@ -1,9 +1,10 @@
 /*****************************************************************************
- * dct.h: AArch64 transform and zigzag
+ * dct.h: aarch64 transform and zigzag
  *****************************************************************************
  * Copyright (C) 2009-2014 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -48,5 +49,18 @@ void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
 void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
 
 void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );
+
+int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
+int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
+int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
+int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
+
+int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
+int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
+
+void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 
 #endif
diff --git a/common/dct.c b/common/dct.c
index 08f4e893..e1fb42a3 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -1004,7 +1004,20 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
 #endif
 #if HAVE_ARMV6 || ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
-        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
+    {
+        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
+#if ARCH_AARCH64
+        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon;
+        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon;
+        pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon;
+        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon;
+        pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon;
+        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon;
+        pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon;
+        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
+        pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon;
+#endif // ARCH_AARCH64
+    }
 #endif // HAVE_ARMV6 || ARCH_AARCH64
 #endif // HIGH_BIT_DEPTH
 
@@ -1047,4 +1060,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
     }
 #endif // HIGH_BIT_DEPTH
 #endif
+#if !HIGH_BIT_DEPTH
+#if ARCH_AARCH64
+    if( cpu&X264_CPU_NEON )
+    {
+        pf_interlaced->interleave_8x8_cavlc =
+        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
+    }
+#endif // ARCH_AARCH64
+#endif // !HIGH_BIT_DEPTH
 }
diff --git a/common/deblock.c b/common/deblock.c
index 382eb721..51f7782b 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -835,18 +835,18 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
     {
         pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
         pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
-    }
+    }
 #endif // HAVE_ALTIVEC
 
 #if HAVE_ARMV6 || ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-    {
+    if( cpu&X264_CPU_NEON )
+    {
         pf->deblock_luma[1] = x264_deblock_v_luma_neon;
         pf->deblock_luma[0] = x264_deblock_h_luma_neon;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
         pf->deblock_strength = x264_deblock_strength_neon;
-    }
+    }
 #endif
 #endif // !HIGH_BIT_DEPTH