From: Janne Grunau
Date: Fri, 18 Jul 2014 13:49:10 +0000 (+0100)
Subject: aarch64: deblocking NEON asm
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1343db872b1d7d43dc7fb431a8207efb5ca31e2e;p=libx264

aarch64: deblocking NEON asm

The chroma and luma deblocking functions are based on libav's h264
aarch64 NEON deblocking filter, which I ported from the existing ARM
NEON asm, so there are no additional persons to ask for a relicense.
---

diff --git a/Makefile b/Makefile
index 397b54de..171b46d9 100644
--- a/Makefile
+++ b/Makefile
@@ -127,6 +127,7 @@ endif
 ifeq ($(ARCH),AARCH64)
 ifneq ($(AS),)
 ASMSRC += common/aarch64/dct-a.S \
+          common/aarch64/deblock-a.S \
           common/aarch64/mc-a.S \
           common/aarch64/pixel-a.S \
           common/aarch64/predict-a.S \
diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S
index bf3169fe..e689a42c 100644
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -107,6 +107,11 @@ MACH .const_data
     sub     \sub, \a, \b
 .endm
 
+.macro unzip t1, t2, s1, s2
+    uzp1    \t1, \s1, \s2
+    uzp2    \t2, \s1, \s2
+.endm
+
 .macro transpose t1, t2, s1, s2
     trn1    \t1, \s1, \s2
     trn2    \t2, \s1, \s2
@@ -158,3 +163,59 @@ MACH .const_data
     trn1    \r3\().2D, \r9\().2D, \r7\().2D
     trn2    \r7\().2D, \r9\().2D, \r7\().2D
 .endm
+
+.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+    trn1    \t0\().16b, \r0\().16b, \r1\().16b
+    trn2    \t1\().16b, \r0\().16b, \r1\().16b
+    trn1    \r1\().16b, \r2\().16b, \r3\().16b
+    trn2    \r3\().16b, \r2\().16b, \r3\().16b
+    trn1    \r0\().16b, \r4\().16b, \r5\().16b
+    trn2    \r5\().16b, \r4\().16b, \r5\().16b
+    trn1    \r2\().16b, \r6\().16b, \r7\().16b
+    trn2    \r7\().16b, \r6\().16b, \r7\().16b
+
+    trn1    \r4\().8h, \r0\().8h, \r2\().8h
+    trn2    \r2\().8h, \r0\().8h, \r2\().8h
+    trn1    \r6\().8h, \r5\().8h, \r7\().8h
+    trn2    \r7\().8h, \r5\().8h, \r7\().8h
+    trn1    \r5\().8h, \t1\().8h, \r3\().8h
+    trn2    \t1\().8h, \t1\().8h, \r3\().8h
+    trn1    \r3\().8h, \t0\().8h, \r1\().8h
+    trn2    \t0\().8h, \t0\().8h, \r1\().8h
+
+    trn1    \r0\().4s, \r3\().4s, \r4\().4s
+    trn2    \r4\().4s, \r3\().4s, \r4\().4s
+
+    trn1    \r1\().4s, \r5\().4s, \r6\().4s
+    trn2    \r5\().4s, \r5\().4s, \r6\().4s
+
+    trn2    \r6\().4s, \t0\().4s, \r2\().4s
+    trn1    \r2\().4s, \t0\().4s, \r2\().4s
+
+    trn1    \r3\().4s, \t1\().4s, \r7\().4s
+    trn2    \r7\().4s, \t1\().4s, \r7\().4s
+.endm
+
+.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
+    trn1    \t4\().16b, \r0\().16b, \r1\().16b
+    trn2    \t5\().16b, \r0\().16b, \r1\().16b
+    trn1    \t6\().16b, \r2\().16b, \r3\().16b
+    trn2    \t7\().16b, \r2\().16b, \r3\().16b
+
+    trn1    \r0\().8h, \t4\().8h, \t6\().8h
+    trn2    \r2\().8h, \t4\().8h, \t6\().8h
+    trn1    \r1\().8h, \t5\().8h, \t7\().8h
+    trn2    \r3\().8h, \t5\().8h, \t7\().8h
+.endm
+
+.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
+    trn1    \t4\().8b, \r0\().8b, \r1\().8b
+    trn2    \t5\().8b, \r0\().8b, \r1\().8b
+    trn1    \t6\().8b, \r2\().8b, \r3\().8b
+    trn2    \t7\().8b, \r2\().8b, \r3\().8b
+
+    trn1    \r0\().4h, \t4\().4h, \t6\().4h
+    trn2    \r2\().4h, \t4\().4h, \t6\().4h
+    trn1    \r1\().4h, \t5\().4h, \t7\().4h
+    trn2    \r3\().4h, \t5\().4h, \t7\().4h
+.endm
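
The unzip macro added above pairs uzp1/uzp2, which de-interleave two
source vectors into their even- and odd-indexed elements; the
deblock_strength code below uses it with .4s lanes to pull interleaved
4-byte groups of the ref/nnz arrays apart. A minimal scalar C model of
its effect on byte lanes -- illustrative only, not part of the patch,
and unzip_16b is a made-up name:

    #include <stdint.h>

    /* uzp1 gathers the even-indexed elements of the concatenation
     * {s1, s2}, uzp2 the odd-indexed ones; shown for 16x8-bit lanes. */
    static void unzip_16b( uint8_t t1[16], uint8_t t2[16],
                           const uint8_t s1[16], const uint8_t s2[16] )
    {
        for( int i = 0; i < 8; i++ )
        {
            t1[i]   = s1[2*i];      /* even elements of s1 ...        */
            t1[8+i] = s2[2*i];      /* ... then even elements of s2   */
            t2[i]   = s1[2*i+1];    /* odd elements of s1 ...         */
            t2[8+i] = s2[2*i+1];    /* ... then odd elements of s2    */
        }
    }
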
modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +.macro h264_loop_filter_start + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + mov v24.s[0], w6 + and w6, w6, w6, lsl #16 + b.eq 1f + ands w6, w6, w6, lsl #8 + b.ge 2f +1: + ret +2: +.endm + +.macro h264_loop_filter_luma + dup v22.16b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0) + uxtl v24.4s, v24.4h + uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) + sli v24.8h, v24.8h, #8 + uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) + sli v24.4s, v24.4s, #16 + cmhi v21.16b, v22.16b, v21.16b // < alpha + dup v22.16b, w3 // beta + cmlt v23.16b, v24.16b, #0 + cmhi v28.16b, v22.16b, v28.16b // < beta + cmhi v30.16b, v22.16b, v30.16b // < beta + bic v21.16b, v21.16b, v23.16b + uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0) + and v21.16b, v21.16b, v28.16b + uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v22.16b, v17.16b // < beta + and v21.16b, v21.16b, v30.16b + cmhi v19.16b, v22.16b, v19.16b // < beta + and v17.16b, v17.16b, v21.16b + and v19.16b, v19.16b, v21.16b + and v24.16b, v24.16b, v21.16b + urhadd v28.16b, v16.16b, v0.16b + sub v21.16b, v24.16b, v17.16b + uqadd v23.16b, v18.16b, v24.16b + uhadd v20.16b, v20.16b, v28.16b + sub v21.16b, v21.16b, v19.16b + uhadd v28.16b, v4.16b, v28.16b + umin v23.16b, v23.16b, v20.16b + uqsub v22.16b, v18.16b, v24.16b + uqadd v4.16b, v2.16b, v24.16b + umax v23.16b, v23.16b, v22.16b + uqsub v22.16b, v2.16b, v24.16b + umin v28.16b, v4.16b, v28.16b + uxtl v4.8h, v0.8b + umax v28.16b, v28.16b, v22.16b + uxtl2 v20.8h, v0.16b + usubw v4.8h, v4.8h, v16.8b + usubw2 v20.8h, v20.8h, v16.16b + shl v4.8h, v4.8h, #2 + shl v20.8h, v20.8h, #2 + uaddw v4.8h, v4.8h, v18.8b + uaddw2 v20.8h, v20.8h, v18.16b + usubw v4.8h, v4.8h, v2.8b + usubw2 v20.8h, v20.8h, v2.16b + rshrn v4.8b, v4.8h, #3 + rshrn2 v4.16b, v20.8h, #3 + bsl v17.16b, v23.16b, v18.16b + bsl v19.16b, v28.16b, v2.16b + neg v23.16b, v21.16b + uxtl v28.8h, v16.8b + smin v4.16b, v4.16b, v21.16b + uxtl2 v21.8h, v16.16b + smax v4.16b, v4.16b, v23.16b + uxtl v22.8h, v0.8b + uxtl2 v24.8h, v0.16b + saddw v28.8h, v28.8h, v4.8b + saddw2 v21.8h, v21.8h, v4.16b + ssubw v22.8h, v22.8h, v4.8b + ssubw2 v24.8h, v24.8h, v4.16b + sqxtun v16.8b, v28.8h + sqxtun2 v16.16b, v21.8h + sqxtun v0.8b, v22.8h + sqxtun2 v0.16b, v24.8h +.endm + +function x264_deblock_v_luma_neon, export=1 + h264_loop_filter_start + + ld1 {v0.16b}, [x0], x1 + ld1 {v2.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + ld1 {v20.16b}, [x0], x1 + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 + + h264_loop_filter_luma + + sub x0, x0, x1, lsl #1 + st1 {v17.16b}, [x0], x1 + st1 {v16.16b}, [x0], 
+
+.macro h264_loop_filter_luma
+    dup     v22.16b, w2                     // alpha
+    uxtl    v24.8h,  v24.8b
+    uabd    v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
+    uxtl    v24.4s,  v24.4h
+    uabd    v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
+    sli     v24.8h,  v24.8h,  #8
+    uabd    v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
+    sli     v24.4s,  v24.4s,  #16
+    cmhi    v21.16b, v22.16b, v21.16b       // < alpha
+    dup     v22.16b, w3                     // beta
+    cmlt    v23.16b, v24.16b, #0
+    cmhi    v28.16b, v22.16b, v28.16b       // < beta
+    cmhi    v30.16b, v22.16b, v30.16b       // < beta
+    bic     v21.16b, v21.16b, v23.16b
+    uabd    v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
+    and     v21.16b, v21.16b, v28.16b
+    uabd    v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
+    cmhi    v17.16b, v22.16b, v17.16b       // < beta
+    and     v21.16b, v21.16b, v30.16b
+    cmhi    v19.16b, v22.16b, v19.16b       // < beta
+    and     v17.16b, v17.16b, v21.16b
+    and     v19.16b, v19.16b, v21.16b
+    and     v24.16b, v24.16b, v21.16b
+    urhadd  v28.16b, v16.16b, v0.16b
+    sub     v21.16b, v24.16b, v17.16b
+    uqadd   v23.16b, v18.16b, v24.16b
+    uhadd   v20.16b, v20.16b, v28.16b
+    sub     v21.16b, v21.16b, v19.16b
+    uhadd   v28.16b, v4.16b,  v28.16b
+    umin    v23.16b, v23.16b, v20.16b
+    uqsub   v22.16b, v18.16b, v24.16b
+    uqadd   v4.16b,  v2.16b,  v24.16b
+    umax    v23.16b, v23.16b, v22.16b
+    uqsub   v22.16b, v2.16b,  v24.16b
+    umin    v28.16b, v4.16b,  v28.16b
+    uxtl    v4.8h,   v0.8b
+    umax    v28.16b, v28.16b, v22.16b
+    uxtl2   v20.8h,  v0.16b
+    usubw   v4.8h,   v4.8h,   v16.8b
+    usubw2  v20.8h,  v20.8h,  v16.16b
+    shl     v4.8h,   v4.8h,   #2
+    shl     v20.8h,  v20.8h,  #2
+    uaddw   v4.8h,   v4.8h,   v18.8b
+    uaddw2  v20.8h,  v20.8h,  v18.16b
+    usubw   v4.8h,   v4.8h,   v2.8b
+    usubw2  v20.8h,  v20.8h,  v2.16b
+    rshrn   v4.8b,   v4.8h,   #3
+    rshrn2  v4.16b,  v20.8h,  #3
+    bsl     v17.16b, v23.16b, v18.16b
+    bsl     v19.16b, v28.16b, v2.16b
+    neg     v23.16b, v21.16b
+    uxtl    v28.8h,  v16.8b
+    smin    v4.16b,  v4.16b,  v21.16b
+    uxtl2   v21.8h,  v16.16b
+    smax    v4.16b,  v4.16b,  v23.16b
+    uxtl    v22.8h,  v0.8b
+    uxtl2   v24.8h,  v0.16b
+    saddw   v28.8h,  v28.8h,  v4.8b
+    saddw2  v21.8h,  v21.8h,  v4.16b
+    ssubw   v22.8h,  v22.8h,  v4.8b
+    ssubw2  v24.8h,  v24.8h,  v4.16b
+    sqxtun  v16.8b,  v28.8h
+    sqxtun2 v16.16b, v21.8h
+    sqxtun  v0.8b,   v22.8h
+    sqxtun2 v0.16b,  v24.8h
+.endm
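
Per pixel across the edge, the macro above computes the standard H.264
bS < 4 luma filter. A scalar sketch of the same arithmetic, mirroring
the C reference in common/deblock.c (luma_filter_px and the clip
helpers are hypothetical names; in the asm the tc0 >= 0 and threshold
tests become the v21/v17/v19 masks rather than branches):

    #include <stdint.h>
    #include <stdlib.h>

    static inline int clip3( int x, int lo, int hi )
    {
        return x < lo ? lo : x > hi ? hi : x;
    }
    static inline uint8_t clip_u8( int x ) { return clip3( x, 0, 255 ); }

    /* pix points at q0; xstride crosses the edge. */
    static void luma_filter_px( uint8_t *pix, intptr_t xstride,
                                int alpha, int beta, int tc0 )
    {
        int p2 = pix[-3*xstride], p1 = pix[-2*xstride], p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride], q2 = pix[ 2*xstride];
        int tc = tc0, delta;

        if( tc0 < 0 || abs(p0-q0) >= alpha || abs(p1-p0) >= beta || abs(q1-q0) >= beta )
            return;
        if( abs(p2-p0) < beta )     /* also filter p1 */
        {
            pix[-2*xstride] = p1 + clip3( ((p2 + ((p0+q0+1)>>1)) >> 1) - p1, -tc0, tc0 );
            tc++;
        }
        if( abs(q2-q0) < beta )     /* also filter q1 */
        {
            pix[ 1*xstride] = q1 + clip3( ((q2 + ((p0+q0+1)>>1)) >> 1) - q1, -tc0, tc0 );
            tc++;
        }
        delta = clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc );
        pix[-1*xstride] = clip_u8( p0 + delta );    /* p0' */
        pix[ 0*xstride] = clip_u8( q0 - delta );    /* q0' */
    }

In the vector code, urhadd/uhadd form the (pX + ((p0+q0+1)>>1)) >> 1
terms, the uqadd/uqsub plus umin/umax sequences implement the +-tc0
clamps on p1'/q1', and rshrn #3 is the rounding shift of the delta.
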
+
+function x264_deblock_v_luma_neon, export=1
+    h264_loop_filter_start
+
+    ld1     {v0.16b},  [x0], x1
+    ld1     {v2.16b},  [x0], x1
+    ld1     {v4.16b},  [x0], x1
+    sub     x0, x0, x1, lsl #2
+    sub     x0, x0, x1, lsl #1
+    ld1     {v20.16b}, [x0], x1
+    ld1     {v18.16b}, [x0], x1
+    ld1     {v16.16b}, [x0], x1
+
+    h264_loop_filter_luma
+
+    sub     x0, x0, x1, lsl #1
+    st1     {v17.16b}, [x0], x1
+    st1     {v16.16b}, [x0], x1
+    st1     {v0.16b},  [x0], x1
+    st1     {v19.16b}, [x0]
+
+    ret
+endfunc
+
+function x264_deblock_h_luma_neon, export=1
+    h264_loop_filter_start
+
+    sub     x0, x0, #4
+    ld1     {v6.8b},  [x0], x1
+    ld1     {v20.8b}, [x0], x1
+    ld1     {v18.8b}, [x0], x1
+    ld1     {v16.8b}, [x0], x1
+    ld1     {v0.8b},  [x0], x1
+    ld1     {v2.8b},  [x0], x1
+    ld1     {v4.8b},  [x0], x1
+    ld1     {v26.8b}, [x0], x1
+    ld1     {v6.d}[1],  [x0], x1
+    ld1     {v20.d}[1], [x0], x1
+    ld1     {v18.d}[1], [x0], x1
+    ld1     {v16.d}[1], [x0], x1
+    ld1     {v0.d}[1],  [x0], x1
+    ld1     {v2.d}[1],  [x0], x1
+    ld1     {v4.d}[1],  [x0], x1
+    ld1     {v26.d}[1], [x0], x1
+
+    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+    h264_loop_filter_luma
+
+    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
+
+    sub     x0, x0, x1, lsl #4
+    add     x0, x0, #2
+    st1     {v17.s}[0], [x0], x1
+    st1     {v16.s}[0], [x0], x1
+    st1     {v0.s}[0],  [x0], x1
+    st1     {v19.s}[0], [x0], x1
+    st1     {v17.s}[1], [x0], x1
+    st1     {v16.s}[1], [x0], x1
+    st1     {v0.s}[1],  [x0], x1
+    st1     {v19.s}[1], [x0], x1
+    st1     {v17.s}[2], [x0], x1
+    st1     {v16.s}[2], [x0], x1
+    st1     {v0.s}[2],  [x0], x1
+    st1     {v19.s}[2], [x0], x1
+    st1     {v17.s}[3], [x0], x1
+    st1     {v16.s}[3], [x0], x1
+    st1     {v0.s}[3],  [x0], x1
+    st1     {v19.s}[3], [x0], x1
+
+    ret
+endfunc
+
+.macro h264_loop_filter_chroma
+    dup     v22.16b, w2                     // alpha
+    uxtl    v24.8h,  v24.8b
+    uabd    v26.16b, v16.16b, v0.16b        // abs(p0 - q0)
+    uxtl    v4.8h,   v0.8b
+    uxtl2   v5.8h,   v0.16b
+    uabd    v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
+    usubw   v4.8h,   v4.8h,   v16.8b
+    usubw2  v5.8h,   v5.8h,   v16.16b
+    sli     v24.8h,  v24.8h,  #8
+    shl     v4.8h,   v4.8h,   #2
+    shl     v5.8h,   v5.8h,   #2
+    uabd    v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
+    uxtl    v24.4s,  v24.4h
+    uaddw   v4.8h,   v4.8h,   v18.8b
+    uaddw2  v5.8h,   v5.8h,   v18.16b
+    cmhi    v26.16b, v22.16b, v26.16b       // < alpha
+    usubw   v4.8h,   v4.8h,   v2.8b
+    usubw2  v5.8h,   v5.8h,   v2.16b
+    sli     v24.4s,  v24.4s,  #16
+    dup     v22.16b, w3                     // beta
+    rshrn   v4.8b,   v4.8h,   #3
+    rshrn2  v4.16b,  v5.8h,   #3
+    cmhi    v28.16b, v22.16b, v28.16b       // < beta
+    cmhi    v30.16b, v22.16b, v30.16b       // < beta
+    smin    v4.16b,  v4.16b,  v24.16b
+    neg     v25.16b, v24.16b
+    and     v26.16b, v26.16b, v28.16b
+    smax    v4.16b,  v4.16b,  v25.16b
+    and     v26.16b, v26.16b, v30.16b
+    uxtl    v22.8h,  v0.8b
+    uxtl2   v23.8h,  v0.16b
+    and     v4.16b,  v4.16b,  v26.16b
+    uxtl    v28.8h,  v16.8b
+    uxtl2   v29.8h,  v16.16b
+    saddw   v28.8h,  v28.8h,  v4.8b
+    saddw2  v29.8h,  v29.8h,  v4.16b
+    ssubw   v22.8h,  v22.8h,  v4.8b
+    ssubw2  v23.8h,  v23.8h,  v4.16b
+    sqxtun  v16.8b,  v28.8h
+    sqxtun  v0.8b,   v22.8h
+    sqxtun2 v16.16b, v29.8h
+    sqxtun2 v0.16b,  v23.8h
+.endm
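
The chroma filter is the same delta computation with p1/q1 never
modified, hence no p2/q2 loads and no tc adjustment in the macro. The
spec's chroma rule tC = tC0 + 1 is, as far as I can tell, already
folded into the tc0 values by the caller in common/deblock.c, which is
also why negative (skip) tc0 entries need no special handling here: a
skipped group arrives as tc = 0 and the clamp zeroes its delta. A
scalar sketch along the same lines as the luma one (hypothetical names;
clip3/clip_u8 as defined in the luma sketch above):

    /* pix points at q0; xstride crosses the edge; tc already
     * includes the chroma +1. */
    static void chroma_filter_px( uint8_t *pix, intptr_t xstride,
                                  int alpha, int beta, int tc )
    {
        int p1 = pix[-2*xstride], p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride];

        if( tc <= 0 || abs(p0-q0) >= alpha || abs(p1-p0) >= beta || abs(q1-q0) >= beta )
            return;
        int delta = clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc );
        pix[-1*xstride] = clip_u8( p0 + delta );    /* p0' */
        pix[ 0*xstride] = clip_u8( q0 - delta );    /* q0' */
    }
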
+
+function x264_deblock_v_chroma_neon, export=1
+    h264_loop_filter_start
+
+    sub     x0, x0, x1, lsl #1
+    ld1     {v18.16b}, [x0], x1
+    ld1     {v16.16b}, [x0], x1
+    ld1     {v0.16b},  [x0], x1
+    ld1     {v2.16b},  [x0]
+
+    h264_loop_filter_chroma
+
+    sub     x0, x0, x1, lsl #1
+    st1     {v16.16b}, [x0], x1
+    st1     {v0.16b},  [x0], x1
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_neon, export=1
+    h264_loop_filter_start
+
+    sub     x0, x0, #4
+    ld1     {v18.d}[0], [x0], x1
+    ld1     {v16.d}[0], [x0], x1
+    ld1     {v0.d}[0],  [x0], x1
+    ld1     {v2.d}[0],  [x0], x1
+    ld1     {v18.d}[1], [x0], x1
+    ld1     {v16.d}[1], [x0], x1
+    ld1     {v0.d}[1],  [x0], x1
+    ld1     {v2.d}[1],  [x0], x1
+
+    transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
+
+    h264_loop_filter_chroma
+
+    transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
+
+    sub     x0, x0, x1, lsl #3
+    st1     {v18.d}[0], [x0], x1
+    st1     {v16.d}[0], [x0], x1
+    st1     {v0.d}[0],  [x0], x1
+    st1     {v2.d}[0],  [x0], x1
+    st1     {v18.d}[1], [x0], x1
+    st1     {v16.d}[1], [x0], x1
+    st1     {v0.d}[1],  [x0], x1
+    st1     {v2.d}[1],  [x0], x1
+
+    ret
+endfunc
+
+
+//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
+//                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+//                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
+//                                uint8_t bs[2][8][4], int mvy_limit,
+//                                int bframe )
+function x264_deblock_strength_neon, export=1
+    movi    v4.16b, #0
+    lsl     w4, w4, #8
+    add     x3, x3, #32
+    sub     w4, w4, #(1<<8)-3
+    movi    v5.16b, #0
+    dup     v6.8h, w4
+    mov     x6, #-32
+
+bframe:
+    // load bytes ref
+    add     x2, x2, #16
+    ld1     {v31.d}[1], [x1], #8
+    ld1     {v1.16b},   [x1], #16
+    movi    v0.16b, #0
+    ld1     {v2.16b},   [x1], #16
+    ext     v3.16b, v0.16b, v1.16b, #15
+    ext     v0.16b, v0.16b, v2.16b, #15
+    unzip   v21.4s, v22.4s, v1.4s, v2.4s
+    unzip   v23.4s, v20.4s, v3.4s, v0.4s
+    ext     v21.16b, v31.16b, v22.16b, #12
+
+    eor     v0.16b, v20.16b, v22.16b
+    eor     v1.16b, v21.16b, v22.16b
+    orr     v4.16b, v4.16b, v0.16b
+    orr     v5.16b, v5.16b, v1.16b
+
+    ld1     {v21.8h}, [x2], #16             // mv + 0x10
+    ld1     {v19.8h}, [x2], #16             // mv + 0x20
+    ld1     {v22.8h}, [x2], #16             // mv + 0x30
+    ld1     {v18.8h}, [x2], #16             // mv + 0x40
+    ld1     {v23.8h}, [x2], #16             // mv + 0x50
+    ext     v19.16b, v19.16b, v22.16b, #12
+    ext     v18.16b, v18.16b, v23.16b, #12
+    sabd    v0.8h, v22.8h, v19.8h
+    ld1     {v19.8h}, [x2], #16             // mv + 0x60
+    sabd    v1.8h, v23.8h, v18.8h
+    ld1     {v24.8h}, [x2], #16             // mv + 0x70
+    uqxtn   v0.8b, v0.8h
+    ld1     {v18.8h}, [x2], #16             // mv + 0x80
+    ld1     {v25.8h}, [x2], #16             // mv + 0x90
+    uqxtn2  v0.16b, v1.8h
+    ext     v19.16b, v19.16b, v24.16b, #12
+    ext     v18.16b, v18.16b, v25.16b, #12
+    sabd    v1.8h, v24.8h, v19.8h
+    sabd    v2.8h, v25.8h, v18.8h
+    uqxtn   v1.8b, v1.8h
+    uqxtn2  v1.16b, v2.8h
+
+    uqsub   v0.16b, v0.16b, v6.16b
+    uqsub   v1.16b, v1.16b, v6.16b
+    uqxtn   v0.8b, v0.8h
+    uqxtn2  v0.16b, v1.8h
+
+    sabd    v1.8h, v22.8h, v23.8h
+    orr     v4.16b, v4.16b, v0.16b
+
+    sabd    v0.8h, v21.8h, v22.8h
+    sabd    v2.8h, v23.8h, v24.8h
+    sabd    v3.8h, v24.8h, v25.8h
+    uqxtn   v0.8b, v0.8h
+    uqxtn2  v0.16b, v1.8h
+    uqxtn   v1.8b, v2.8h
+    uqxtn2  v1.16b, v3.8h
+
+    uqsub   v0.16b, v0.16b, v6.16b
+    uqsub   v1.16b, v1.16b, v6.16b
+    uqxtn   v0.8b, v0.8h
+    uqxtn2  v0.16b, v1.8h
+    subs    w5, w5, #1
+    orr     v5.16b, v5.16b, v0.16b
+    b.eq    bframe
+
+    movi    v6.16b, #1
+    // load bytes nnz
+    ld1     {v31.d}[1], [x0], #8
+    ld1     {v1.16b},   [x0], #16
+    movi    v0.16b, #0
+    ld1     {v2.16b},   [x0], #16
+    ext     v3.16b, v0.16b, v1.16b, #15
+    ext     v0.16b, v0.16b, v2.16b, #15
+    unzip   v21.4s, v22.4s, v1.4s, v2.4s
+    unzip   v23.4s, v20.4s, v3.4s, v0.4s
+    ext     v21.16b, v31.16b, v22.16b, #12
+
+    movrel  x7, transpose_table
+    ld1     {v7.16b}, [x7]
+    orr     v0.16b, v20.16b, v22.16b
+    orr     v1.16b, v21.16b, v22.16b
+    umin    v0.16b, v0.16b, v6.16b
+    umin    v1.16b, v1.16b, v6.16b
+    umin    v4.16b, v4.16b, v6.16b          // mv ? 1 : 0
+    umin    v5.16b, v5.16b, v6.16b
+    add     v0.16b, v0.16b, v0.16b          // nnz ? 2 : 0
+    add     v1.16b, v1.16b, v1.16b
+    umax    v4.16b, v4.16b, v0.16b
+    umax    v5.16b, v5.16b, v1.16b
+    tbl     v6.16b, {v4.16b}, v7.16b
+    st1     {v5.16b}, [x3], x6              // bs[1]
+    st1     {v6.16b}, [x3]                  // bs[0]
+    ret
+endfunc
+
+const transpose_table
+    .byte 0, 4, 8, 12
+    .byte 1, 5, 9, 13
+    .byte 2, 6, 10, 14
+    .byte 3, 7, 11, 15
+endconst
diff --git a/common/deblock.c b/common/deblock.c
index 6b369f2a..382eb721 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -729,7 +729,7 @@ void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int
 void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #endif // ARCH_PPC
 
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
 void x264_deblock_v_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
@@ -838,7 +838,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
 }
 #endif // HAVE_ALTIVEC
 
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
     {
         pf->deblock_luma[1] = x264_deblock_v_luma_neon;
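
For reference, the boundary-strength rule that x264_deblock_strength_neon
vectorizes (see the deblock_strength_c prototype quoted above) reduces,
per 4x4 block pair across an edge, to roughly the following scalar
model; x264's scan8 indexing is elided and bs_for_pair is a made-up
name:

    #include <stdint.h>
    #include <stdlib.h>

    /* One 4x4 block pair across an edge: index 0/1 = the two blocks. */
    static uint8_t bs_for_pair( const uint8_t nnz[2], const int8_t ref[2][2],
                                const int16_t mv[2][2][2],
                                int mvy_limit, int bframe )
    {
        if( nnz[0] || nnz[1] )                  /* residual coded -> bs = 2 */
            return 2;
        for( int l = 0; l <= bframe; l++ )      /* list 0, plus list 1 for B-frames */
            if( ref[l][0] != ref[l][1] ||
                abs( mv[l][0][0] - mv[l][1][0] ) >= 4 ||    /* >= 1 luma sample */
                abs( mv[l][0][1] - mv[l][1][1] ) >= mvy_limit )
                return 1;                       /* motion discontinuity -> bs = 1 */
        return 0;
    }

The asm does this branchlessly: each 16-bit lane of v6 packs the two
limits as 3 | (mvy_limit - 1) << 8, so after sabd the saturating
uqsub/uqxtn chain leaves a nonzero byte exactly where an mv component
exceeded its limit; nnz hits are doubled to 2 and merged with umax, and
the final tbl against transpose_table rearranges bs[0] into the layout
the caller expects.
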