From 3c1fa5d9b2ea62f05473080313c543b7e795b307 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Fri, 18 Jul 2014 09:29:35 +0100 Subject: [PATCH] aarch64: intra prediction NEON asm Ported from the ARM NEON asm. --- Makefile | 4 +- common/aarch64/predict-a.S | 661 +++++++++++++++++++++++++++++++++++++ common/aarch64/predict-c.c | 114 +++++++ common/aarch64/predict.h | 52 +++ common/pixel.c | 19 +- common/predict.c | 19 ++ 6 files changed, 857 insertions(+), 12 deletions(-) create mode 100644 common/aarch64/predict-a.S create mode 100644 common/aarch64/predict-c.c create mode 100644 common/aarch64/predict.h diff --git a/Makefile b/Makefile index b0d4a149..397b54de 100644 --- a/Makefile +++ b/Makefile @@ -129,8 +129,10 @@ ifneq ($(AS),) ASMSRC += common/aarch64/dct-a.S \ common/aarch64/mc-a.S \ common/aarch64/pixel-a.S \ + common/aarch64/predict-a.S \ common/aarch64/quant-a.S -SRCS += common/aarch64/mc-c.c +SRCS += common/aarch64/mc-c.c \ + common/aarch64/predict-c.c OBJASM = $(ASMSRC:%.S=%.o) endif endif diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S new file mode 100644 index 00000000..8b2283cf --- /dev/null +++ b/common/aarch64/predict-a.S @@ -0,0 +1,661 @@ +/***************************************************************************** + * predict.S: aarch64 intra prediction + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad + * Mans Rullgard + * Janne Grunau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +const p8weight, align=4 + .short 1, 2, 3, 4, 1, 2, 3, 4 +endconst +const p16weight, align=4 + .short 1, 2, 3, 4, 5, 6, 7, 8 +endconst + +.macro ldcol.8 vd, xn, xm, n=8, hi=0 +.if \n == 8 || \hi == 0 + ld1 {\vd\().b}[0], [\xn], \xm + ld1 {\vd\().b}[1], [\xn], \xm + ld1 {\vd\().b}[2], [\xn], \xm + ld1 {\vd\().b}[3], [\xn], \xm +.endif +.if \n == 8 || \hi == 1 + ld1 {\vd\().b}[4], [\xn], \xm + ld1 {\vd\().b}[5], [\xn], \xm + ld1 {\vd\().b}[6], [\xn], \xm + ld1 {\vd\().b}[7], [\xn], \xm +.endif +.endm + +.macro ldcol.16 vd, xn, xm + ldcol.8 \vd, \xn, \xm + ld1 {\vd\().b}[ 8], [\xn], \xm + ld1 {\vd\().b}[ 9], [\xn], \xm + ld1 {\vd\().b}[10], [\xn], \xm + ld1 {\vd\().b}[11], [\xn], \xm + ld1 {\vd\().b}[12], [\xn], \xm + ld1 {\vd\().b}[13], [\xn], \xm + ld1 {\vd\().b}[14], [\xn], \xm + ld1 {\vd\().b}[15], [\xn], \xm +.endm + + +function x264_predict_4x4_h_aarch64, export=1 + ldrb w1, [x0, #0*FDEC_STRIDE-1] + ldrb w2, [x0, #1*FDEC_STRIDE-1] + ldrb w3, [x0, #2*FDEC_STRIDE-1] + ldrb w4, [x0, #3*FDEC_STRIDE-1] + add w1, w1, w1, lsl #8 + add w2, w2, w2, lsl #8 + add w3, w3, w3, lsl #8 + add w4, w4, w4, lsl #8 + add w1, w1, w1, lsl #16 + str w1, [x0, #0*FDEC_STRIDE] + add w2, w2, w2, lsl #16 + str w2, [x0, #1*FDEC_STRIDE] + add w3, w3, w3, lsl #16 + str w3, [x0, #2*FDEC_STRIDE] + add w4, w4, w4, lsl #16 + str w4, [x0, #3*FDEC_STRIDE] + ret +endfunc + +function x264_predict_4x4_v_aarch64, export=1 + ldr w1, [x0, #0 - 1 * FDEC_STRIDE] + str w1, [x0, #0 + 0 * FDEC_STRIDE] + 
str w1, [x0, #0 + 1 * FDEC_STRIDE] + str w1, [x0, #0 + 2 * FDEC_STRIDE] + str w1, [x0, #0 + 3 * FDEC_STRIDE] + ret +endfunc + +function x264_predict_4x4_dc_neon, export=1 + sub x1, x0, #FDEC_STRIDE + sub x2, x0, #1 + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1] + ld1r {v1.8b}, [x2], x7 + ld1r {v2.8b}, [x2], x7 + ld1r {v3.8b}, [x2], x7 + ld1r {v4.8b}, [x2], x7 + uaddlp v0.4h, v0.8b + uaddl v1.8h, v1.8b, v2.8b + uaddl v2.8h, v3.8b, v4.8b + addp v0.4h, v0.4h, v0.4h + add v1.4h, v1.4h, v2.4h + dup v0.4h, v0.h[0] + add v0.4h, v0.4h, v1.4h + rshrn v0.8b, v0.8h, #3 + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0] + ret +endfunc + +function x264_predict_4x4_dc_top_neon, export=1 + sub x1, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1] + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + dup v0.4h, v0.h[0] + rshrn v0.8b, v0.8h, #2 + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0] + ret +endfunc + +function x264_predict_4x4_ddr_neon, export=1 + sub x1, x0, #FDEC_STRIDE+1 + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1 + ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1 + ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1 + ext v0.8b, v1.8b, v0.8b, #7 + ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1 + ext v0.8b, v2.8b, v0.8b, #7 // a + ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1 + ext v1.8b, v3.8b, v0.8b, #7 // b + ext v2.8b, v4.8b, v1.8b, #7 // c + uaddl v0.8h, v0.8b, v1.8b + uaddl v1.8h, v1.8b, v2.8b + add v0.8h, v0.8h, v1.8h + rshrn v0.8b, v0.8h, #2 + + ext v3.8b, v0.8b, v0.8b, #3 + ext v2.8b, v0.8b, v0.8b, #2 + ext v1.8b, v0.8b, v0.8b, #1 + + str s3, [x0], #FDEC_STRIDE + str s2, [x0], #FDEC_STRIDE + str s1, [x0], #FDEC_STRIDE + str s0, [x0] + ret +endfunc + +function x264_predict_4x4_ddl_neon, export=1 + sub x0, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x0], x7 + dup v3.8b, v0.b[7] + ext v1.8b, v0.8b, v0.8b, #1 + ext v2.8b, v0.8b, v3.8b, #2 + uhadd 
v0.8b, v0.8b, v2.8b + urhadd v0.8b, v0.8b, v1.8b + str s0, [x0], #FDEC_STRIDE + ext v1.8b, v0.8b, v0.8b, #1 + ext v2.8b, v0.8b, v0.8b, #2 + str s1, [x0], #FDEC_STRIDE + ext v3.8b, v0.8b, v0.8b, #3 + str s2, [x0], #FDEC_STRIDE + str s3, [x0] + ret +endfunc + +function x264_predict_8x8_dc_neon, export=1 + mov x7, #FDEC_STRIDE + ld1 {v0.16b}, [x1], #16 + ld1 {v1.8b}, [x1] + ext v0.16b, v0.16b, v0.16b, #7 + uaddlv h1, v1.8b + uaddlv h0, v0.8b + add v0.8h, v0.8h, v1.8h + dup v0.8h, v0.h[0] + rshrn v0.8b, v0.8h, #4 +.rept 8 + st1 {v0.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8_h_neon, export=1 + mov x7, #FDEC_STRIDE + ld1 {v16.16b}, [x1] + dup v0.8b, v16.b[14] + dup v1.8b, v16.b[13] + st1 {v0.8b}, [x0], x7 + dup v2.8b, v16.b[12] + st1 {v1.8b}, [x0], x7 + dup v3.8b, v16.b[11] + st1 {v2.8b}, [x0], x7 + dup v4.8b, v16.b[10] + st1 {v3.8b}, [x0], x7 + dup v5.8b, v16.b[9] + st1 {v4.8b}, [x0], x7 + dup v6.8b, v16.b[8] + st1 {v5.8b}, [x0], x7 + dup v7.8b, v16.b[7] + st1 {v6.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_v_neon, export=1 + add x1, x1, #16 + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1] +.rept 8 + st1 {v0.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8_ddl_neon, export=1 + add x1, x1, #16 + mov x7, #FDEC_STRIDE + ld1 {v0.16b}, [x1] + movi v3.16b, #0 + dup v2.16b, v0.b[15] + ext v4.16b, v3.16b, v0.16b, #15 + ext v2.16b, v0.16b, v2.16b, #1 + uhadd v4.16b, v4.16b, v2.16b + urhadd v0.16b, v0.16b, v4.16b + ext v1.16b, v0.16b, v0.16b, #1 + ext v2.16b, v0.16b, v0.16b, #2 + st1 {v1.8b}, [x0], x7 + ext v3.16b, v0.16b, v0.16b, #3 + st1 {v2.8b}, [x0], x7 + ext v4.16b, v0.16b, v0.16b, #4 + st1 {v3.8b}, [x0], x7 + ext v5.16b, v0.16b, v0.16b, #5 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #6 + st1 {v5.8b}, [x0], x7 + ext v7.16b, v0.16b, v0.16b, #7 + st1 {v6.8b}, [x0], x7 + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v7.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ret +endfunc + +function 
x264_predict_8x8_ddr_neon, export=1 + ld1 {v0.16b,v1.16b}, [x1] + ext v2.16b, v0.16b, v1.16b, #7 + ext v4.16b, v0.16b, v1.16b, #9 + ext v3.16b, v0.16b, v1.16b, #8 + + uhadd v2.16b, v2.16b, v4.16b + urhadd v7.16b, v3.16b, v2.16b + + add x0, x0, #7*FDEC_STRIDE + mov x7, #-1*FDEC_STRIDE + + ext v6.16b, v7.16b, v7.16b, #1 + st1 {v7.8b}, [x0], x7 + ext v5.16b, v7.16b, v7.16b, #2 + st1 {v6.8b}, [x0], x7 + ext v4.16b, v7.16b, v7.16b, #3 + st1 {v5.8b}, [x0], x7 + ext v3.16b, v7.16b, v7.16b, #4 + st1 {v4.8b}, [x0], x7 + ext v2.16b, v7.16b, v7.16b, #5 + st1 {v3.8b}, [x0], x7 + ext v1.16b, v7.16b, v7.16b, #6 + st1 {v2.8b}, [x0], x7 + ext v0.16b, v7.16b, v7.16b, #7 + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_vl_neon, export=1 + add x1, x1, #16 + mov x7, #FDEC_STRIDE + + ld1 {v0.16b}, [x1] + ext v1.16b, v1.16b, v0.16b, #15 + ext v2.16b, v0.16b, v2.16b, #1 + + uhadd v1.16b, v1.16b, v2.16b + urhadd v3.16b, v0.16b, v2.16b + + urhadd v0.16b, v0.16b, v1.16b + + ext v4.16b, v0.16b, v0.16b, #1 + st1 {v3.8b}, [x0], x7 + ext v5.16b, v3.16b, v3.16b, #1 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #2 + st1 {v5.8b}, [x0], x7 + ext v7.16b, v3.16b, v3.16b, #2 + st1 {v6.8b}, [x0], x7 + ext v4.16b, v0.16b, v0.16b, #3 + st1 {v7.8b}, [x0], x7 + ext v5.16b, v3.16b, v3.16b, #3 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #4 + st1 {v5.8b}, [x0], x7 + st1 {v6.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_vr_neon, export=1 + add x1, x1, #8 + mov x7, #FDEC_STRIDE + ld1 {v2.16b}, [x1] + + ext v1.16b, v2.16b, v2.16b, #14 + ext v0.16b, v2.16b, v2.16b, #15 + + uhadd v3.16b, v2.16b, v1.16b + urhadd v2.16b, v2.16b, v0.16b + urhadd v0.16b, v0.16b, v3.16b + + ext v1.16b, v2.16b, v2.16b, #8 + uzp1 v2.8b, v0.8b, v0.8b + uzp2 v3.8b, v0.8b, v0.8b + ext v0.16b, v0.16b, v0.16b, #8 + + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ext v4.8b, v3.8b, v1.8b, #7 + ext v5.8b, v2.8b, v0.8b, #7 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + 
ext v6.8b, v3.8b, v1.8b, #6 + ext v7.8b, v2.8b, v0.8b, #6 + st1 {v6.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 + ext v1.8b, v3.8b, v1.8b, #5 + ext v0.8b, v2.8b, v0.8b, #5 + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_hd_neon, export=1 + add x1, x1, #7 + mov x7, #FDEC_STRIDE + + ld1 {v1.16b}, [x1] + ext v3.16b, v1.16b, v1.16b, #1 + ext v2.16b, v1.16b, v1.16b, #2 + + urhadd v4.16b, v1.16b, v3.16b + + uhadd v1.16b, v1.16b, v2.16b + urhadd v0.16b, v1.16b, v3.16b + + zip1 v16.8b, v4.8b, v0.8b + zip2 v17.8b, v4.8b, v0.8b + ext v7.16b, v0.16b, v0.16b, #8 + + ext v0.8b, v17.8b, v7.8b, #6 + ext v1.8b, v17.8b, v7.8b, #4 + st1 {v0.8b}, [x0], x7 + ext v2.8b, v17.8b, v7.8b, #2 + st1 {v1.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 + ext v3.8b, v16.8b, v17.8b, #6 + st1 {v17.8b}, [x0], x7 + ext v4.8b, v16.8b, v17.8b, #4 + st1 {v3.8b}, [x0], x7 + ext v5.8b, v16.8b, v17.8b, #2 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + st1 {v16.8b}, [x0], x7 + + ret +endfunc + +function x264_predict_8x8_hu_neon, export=1 + add x1, x1, #7 + mov x7, #FDEC_STRIDE + ld1 {v7.8b}, [x1] + dup v6.8b, v7.b[0] + rev64 v7.8b, v7.8b + + ext v4.8b, v7.8b, v6.8b, #2 + ext v2.8b, v7.8b, v6.8b, #1 + + uhadd v5.8b, v7.8b, v4.8b + urhadd v0.8b, v2.8b, v7.8b + urhadd v1.8b, v5.8b, v2.8b + + zip1 v16.8b, v0.8b, v1.8b + zip2 v17.8b, v0.8b, v1.8b + + dup v18.4h, v17.h[3] + + ext v0.8b, v16.8b, v17.8b, #2 + ext v1.8b, v16.8b, v17.8b, #4 + ext v2.8b, v16.8b, v17.8b, #6 + st1 {v16.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 + + ext v4.8b, v17.8b, v18.8b, #2 + ext v5.8b, v17.8b, v18.8b, #4 + ext v6.8b, v17.8b, v18.8b, #6 + st1 {v17.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + st1 {v6.8b}, [x0] + ret +endfunc + + +function x264_predict_8x8c_dc_top_neon, export=1 + sub x2, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v0.8b}, [x2] + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + rshrn v0.8b, v0.8h, #2 + dup v3.8b, v0.b[1] 
+ dup v2.8b, v0.b[0] + transpose v0.2s, v1.2s, v2.2s, v3.2s + b pred8x8c_dc_end +endfunc + +function x264_predict_8x8c_dc_left_neon, export=1 + sub x2, x0, #1 + mov x1, #FDEC_STRIDE + ldcol.8 v0, x2, x1 + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + rshrn v0.8b, v0.8h, #2 + dup v1.8b, v0.b[1] + dup v0.8b, v0.b[0] + b pred8x8c_dc_end +endfunc + +function x264_predict_8x8c_dc_neon, export=1 + sub x2, x0, #FDEC_STRIDE + sub x3, x0, #1 + mov x1, #FDEC_STRIDE + ld1 {v2.8b}, [x2] + ldcol.8 v3, x3, x1 + transpose v0.2s, v1.2s, v2.2s, v3.2s + uaddlp v0.4h, v0.8b // s0, s2 + uaddlp v1.4h, v1.8b // s1, s3 + addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3 + addp v1.4h, v0.4h, v0.4h + rshrn v2.8b, v0.8h, #2 + rshrn v3.8b, v1.8h, #3 + dup v5.8b, v2.b[2] // dc1 + dup v6.8b, v3.b[1] // dc2 + dup v4.8b, v3.b[0] // dc0 + dup v7.8b, v2.b[3] // dc3 + trn1 v0.2s, v4.2s, v5.2s + trn1 v1.2s, v7.2s, v6.2s +pred8x8c_dc_end: + add x2, x0, x1, lsl #2 +.rept 4 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x2], x1 +.endr + ret +endfunc + +function x264_predict_8x8c_h_neon, export=1 + sub x1, x0, #1 + mov x7, #FDEC_STRIDE +.rept 4 + ld1r {v0.8b}, [x1], x7 + ld1r {v1.8b}, [x1], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8c_v_neon, export=1 + sub x0, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x0], x7 +.rept 8 + st1 {v0.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8c_p_neon, export=1 + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + add x2, x3, #4 + sub x3, x3, #1 + ld1 {v0.s}[0], [x3] + ld1 {v2.s}[0], [x2], x1 + ldcol.8 v0, x3, x1, 4, hi=1 + add x3, x3, x1 + ldcol.8 v3, x3, x1, 4 + movrel x4, p8weight + movrel x5, p16weight + uaddl v4.8h, v2.8b, v3.8b + rev32 v0.8b, v0.8b + trn1 v2.2s, v2.2s, v3.2s + ld1 {v7.8h}, [x4] + usubl v2.8h, v2.8b, v0.8b + mul v2.8h, v2.8h, v7.8h + ld1 {v0.8h}, [x5] + saddlp v2.4s, v2.8h + addp v2.4s, v2.4s, v2.4s + shl v3.2s, v2.2s, #4 + add v2.2s, v2.2s, v3.2s + rshrn v5.4h, v2.4s, #5 // 
b, c, x, x + addp v2.4h, v5.4h, v5.4h + shl v3.4h, v2.4h, #2 + sub v3.4h, v3.4h, v2.4h // 3 * (b + c) + rev64 v4.4h, v4.4h + add v4.4h, v4.4h, v0.4h + shl v2.4h, v4.4h, #4 // a + sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16 + ext v0.16b, v0.16b, v0.16b, #14 + sub v6.4h, v5.4h, v3.4h + mov v0.h[0], wzr + mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b + dup v1.8h, v2.h[0] // pix + dup v2.8h, v5.h[1] // c + add v1.8h, v1.8h, v0.8h // pix + x*b + mov x3, #8 +1: + subs x3, x3, #1 + sqshrun v0.8b, v1.8h, #5 + add v1.8h, v1.8h, v2.8h + st1 {v0.8b}, [x0], x1 + b.ne 1b + ret +endfunc + + +function x264_predict_16x16_dc_top_neon, export=1 + sub x2, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] + b pred16x16_dc_end +endfunc + +function x264_predict_16x16_dc_left_neon, export=1 + sub x2, x0, #1 + mov x1, #FDEC_STRIDE + ldcol.16 v0, x2, x1 + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] + b pred16x16_dc_end +endfunc + +function x264_predict_16x16_dc_neon, export=1 + sub x3, x0, #FDEC_STRIDE + sub x2, x0, #1 + mov x1, #FDEC_STRIDE + ld1 {v0.16b}, [x3] + ldcol.16 v1, x2, x1 + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v0.4h, v0.4h, v1.4h + rshrn v0.8b, v0.8h, #5 + dup v0.16b, v0.b[0] +pred16x16_dc_end: +.rept 16 + st1 {v0.16b}, [x0], x1 +.endr + ret +endfunc + +function x264_predict_16x16_h_neon, export=1 + sub x1, x0, #1 + mov x7, #FDEC_STRIDE +.rept 8 + ld1r {v0.16b}, [x1], x7 + ld1r {v1.16b}, [x1], x7 + st1 {v0.16b}, [x0], x7 + st1 {v1.16b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_16x16_v_neon, export=1 + sub x0, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.16b}, [x0], x7 +.rept 16 + st1 {v0.16b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_16x16_p_neon, export=1 + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + add x2, x3, #8 + sub x3, x3, #1 + ld1 {v0.8b}, [x3] + ld1 {v2.8b}, [x2], x1 + ldcol.8 v1, x3, x1 + add x3, x3, x1 + ldcol.8 v3, 
x3, x1 + rev64 v0.8b, v0.8b + rev64 v1.8b, v1.8b + movrel x4, p16weight + uaddl v4.8h, v2.8b, v3.8b + ld1 {v7.8h}, [x4] + usubl v2.8h, v2.8b, v0.8b + usubl v3.8h, v3.8b, v1.8b + mul v2.8h, v2.8h, v7.8h + mul v3.8h, v3.8h, v7.8h + saddlp v2.4s, v2.8h + saddlp v3.4s, v3.8h + addp v2.4s, v2.4s, v3.4s + addp v2.4s, v2.4s, v2.4s + shl v3.2s, v2.2s, #2 + add v2.2s, v2.2s, v3.2s + rshrn v5.4h, v2.4s, #6 // b, c, x, x + addp v2.4h, v5.4h, v5.4h + shl v3.4h, v2.4h, #3 + sub v3.4h, v3.4h, v2.4h // 7 * (b + c) + ext v4.16b, v4.16b, v4.16b, #14 + add v4.4h, v4.4h, v7.4h + shl v2.4h, v4.4h, #4 // a + sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16 + ext v7.16b, v7.16b, v7.16b, #14 + mov v7.h[0], wzr + dup v3.8h, v5.h[0] + mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b + dup v1.8h, v2.h[0] // pix + dup v2.8h, v5.h[1] // c + shl v3.8h, v3.8h, #3 + add v1.8h, v1.8h, v0.8h // pix + x*b + add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b + mov x3, #16 +1: + subs x3, x3, #1 + sqshrun v0.8b, v1.8h, #5 + add v1.8h, v1.8h, v2.8h + sqshrun2 v0.16b, v3.8h, #5 + add v3.8h, v3.8h, v2.8h + st1 {v0.16b}, [x0], x1 + b.ne 1b + ret +endfunc diff --git a/common/aarch64/predict-c.c b/common/aarch64/predict-c.c new file mode 100644 index 00000000..3803b575 --- /dev/null +++ b/common/aarch64/predict-c.c @@ -0,0 +1,114 @@ +/***************************************************************************** + * predict.c: aarch64 intra prediction + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "predict.h" +#include "pixel.h" + +void x264_predict_4x4_dc_top_neon( uint8_t *src ); +void x264_predict_4x4_ddr_neon( uint8_t *src ); +void x264_predict_4x4_ddl_neon( uint8_t *src ); + +void x264_predict_8x8c_dc_top_neon( uint8_t *src ); +void x264_predict_8x8c_dc_left_neon( uint8_t *src ); +void x264_predict_8x8c_p_neon( uint8_t *src ); + +void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); + +void x264_predict_16x16_dc_top_neon( uint8_t *src ); +void x264_predict_16x16_dc_left_neon( uint8_t *src ); +void x264_predict_16x16_p_neon( uint8_t *src ); + +void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] ) +{ +#if !HIGH_BIT_DEPTH + if (cpu&X264_CPU_ARMV8) + { + pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64; + pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64; + } + + if (cpu&X264_CPU_NEON) + { + pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_neon; + pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon; + 
pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon; + pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_neon; + } +#endif // !HIGH_BIT_DEPTH +} + +void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon; + pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon; + pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon; + pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon; + pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon; + pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon; +#endif // !HIGH_BIT_DEPTH +} + +void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon; + pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon; + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon; + pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon; + pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon; + pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon; + pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon; + pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon; +#endif // !HIGH_BIT_DEPTH +} + +void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon; + pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon; + pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon; + pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon; + pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon; + pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon; +#endif // !HIGH_BIT_DEPTH +} diff --git a/common/aarch64/predict.h b/common/aarch64/predict.h new file mode 100644 index 00000000..2d26a058 --- /dev/null +++ b/common/aarch64/predict.h @@ -0,0 +1,52 @@ 
+/***************************************************************************** + * predict.h: aarch64 intra prediction + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_AARCH64_PREDICT_H +#define X264_AARCH64_PREDICT_H + +void x264_predict_4x4_h_aarch64( uint8_t *src ); +void x264_predict_4x4_v_aarch64( uint8_t *src ); + +// for the merged 4x4 intra sad/satd which expects unified suffix +#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64 +#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64 + +void x264_predict_4x4_dc_neon( uint8_t *src ); +void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8c_dc_neon( uint8_t *src ); +void x264_predict_8x8c_h_neon( uint8_t *src ); +void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_16x16_v_neon( uint8_t *src ); +void x264_predict_16x16_h_neon( uint8_t *src ); +void x264_predict_16x16_dc_neon( uint8_t *src ); + +void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] ); +void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); +void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] ); +void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] ); + +#endif /* X264_AARCH64_PREDICT_H */ diff --git a/common/pixel.c b/common/pixel.c index 5524af73..478f857f 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -40,6 +40,7 @@ #endif #if ARCH_AARCH64 # include "aarch64/pixel.h" +# include "aarch64/predict.h" #endif @@ -523,14 +524,10 @@ INTRA_MBCMP_8x8(sa8d,, _c ) INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif -#if !HIGH_BIT_DEPTH && HAVE_ARMV6 +#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64) INTRA_MBCMP_8x8( sad, _neon, _neon ) INTRA_MBCMP_8x8(sa8d, _neon, _neon ) #endif -#if !HIGH_BIT_DEPTH && ARCH_AARCH64 -INTRA_MBCMP_8x8( sad, _neon, _c ) -INTRA_MBCMP_8x8(sa8d, _neon, _c ) -#endif #define 
INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\ void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\ @@ -597,14 +594,14 @@ INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif #if !HIGH_BIT_DEPTH && ARCH_AARCH64 -INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c ) -INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c ) -INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _c ) -INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _neon ) +INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon ) +INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) -INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _c ) -INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _c ) +INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) +INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif // No C implementation of intra_satd_x9. 
See checkasm for its behavior, diff --git a/common/predict.c b/common/predict.c index cbc018d3..f9c46152 100644 --- a/common/predict.c +++ b/common/predict.c @@ -40,6 +40,9 @@ #if ARCH_ARM # include "arm/predict.h" #endif +#if ARCH_AARCH64 +# include "aarch64/predict.h" +#endif /**************************************************************************** * 16x16 prediction for intra luma block @@ -899,6 +902,10 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] ) #if HAVE_ARMV6 x264_predict_16x16_init_arm( cpu, pf ); #endif + +#if ARCH_AARCH64 + x264_predict_16x16_init_aarch64( cpu, pf ); +#endif } void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) @@ -923,6 +930,10 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) #if HAVE_ARMV6 x264_predict_8x8c_init_arm( cpu, pf ); #endif + +#if ARCH_AARCH64 + x264_predict_8x8c_init_aarch64( cpu, pf ); +#endif } void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] ) @@ -963,6 +974,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_ #if HAVE_ARMV6 x264_predict_8x8_init_arm( cpu, pf, predict_filter ); #endif + +#if ARCH_AARCH64 + x264_predict_8x8_init_aarch64( cpu, pf, predict_filter ); +#endif } void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] ) @@ -987,5 +1002,9 @@ void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] ) #if HAVE_ARMV6 x264_predict_4x4_init_arm( cpu, pf ); #endif + +#if ARCH_AARCH64 + x264_predict_4x4_init_aarch64( cpu, pf ); +#endif } -- 2.40.0