From: Janne Grunau Date: Thu, 17 Jul 2014 14:58:44 +0000 (+0100) Subject: aarch64: motion compensation NEON asm X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=556b0e7928d14818454e0c33032754f6323f02e9;p=libx264 aarch64: motion compensation NEON asm Ported from the ARM NEON asm. --- diff --git a/Makefile b/Makefile index d903393e..b0d4a149 100644 --- a/Makefile +++ b/Makefile @@ -127,9 +127,10 @@ endif ifeq ($(ARCH),AARCH64) ifneq ($(AS),) ASMSRC += common/aarch64/dct-a.S \ + common/aarch64/mc-a.S \ common/aarch64/pixel-a.S \ common/aarch64/quant-a.S -SRCS += +SRCS += common/aarch64/mc-c.c OBJASM = $(ASMSRC:%.S=%.o) endif endif diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S new file mode 100644 index 00000000..351317e5 --- /dev/null +++ b/common/aarch64/mc-a.S @@ -0,0 +1,1365 @@ +/***************************************************************************** + * mc.S: aarch64 motion compensation + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad + * Janne Grunau + * Mans Rullgard + * Stefan Groenroos + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "asm.S" + +// note: prefetch stuff assumes 64-byte cacheline + +// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity ) +function x264_prefetch_ref_aarch64, export=1 + cmp w2, #1 + csel x2, xzr, x1, eq + add x0, x0, #64 + add x0, x0, x2, lsl #3 + + lsl x2, x1, #1 + add x3, x1, x1, lsl #1 + add x4, x0, x1, lsl #2 + + prfm pldl1strm, [x0] + prfm pldl1strm, [x0, x1] + prfm pldl1strm, [x0, x2] + prfm pldl1strm, [x0, x3] + prfm pldl1strm, [x4] + prfm pldl1strm, [x4, x1] + prfm pldl1strm, [x4, x2] + prfm pldl1strm, [x4, x3] + ret +endfunc + +// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, +// uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) +.macro x264_prefetch_fenc sub +function x264_prefetch_fenc_\sub\()_aarch64, export=1 + and w6, w5, #3 + and w7, w5, #3 + mul x6, x6, x1 + mul x7, x7, x3 + add x0, x0, #64 + add x2, x2, #64 + + add x0, x0, x6, lsl #2 + add x6, x0, x1, lsl #1 + prfm pldl1strm, [x0] + prfm pldl1strm, [x0, x1] + prfm pldl1strm, [x6] + prfm pldl1strm, [x6, x1] + + add x2, x2, x7, lsl #1 + prfm pldl1strm, [x2] + prfm pldl1strm, [x2, x3] +.ifc \sub, 422 + add x7, x2, x3, lsl #1 + prfm pldl1strm, [x7] + prfm pldl1strm, [x7, x3] +.endif + ret +endfunc +.endm + +x264_prefetch_fenc 420 +x264_prefetch_fenc 422 + +// void pixel_avg( uint8_t *dst, intptr_t dst_stride, +// uint8_t *src1, intptr_t src1_stride, +// uint8_t *src2, intptr_t src2_stride, int weight ); +.macro AVGH w h +function x264_pixel_avg_\w\()x\h\()_neon, export=1 + mov w10, #64 + cmp w6, #32 + mov w9, #\h + b.eq pixel_avg_w\w\()_neon + subs w7, w10, w6 + b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 + cmp w6, #0 + b.ge pixel_avg_weight_w\w\()_add_add_neon + b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 +endfunc +.endm + +AVGH 4, 2 +AVGH 4, 4 +AVGH 4, 8 +AVGH 4, 16 +AVGH 8, 4 +AVGH 8, 8 +AVGH 8, 16 +AVGH 16, 8 +AVGH 16, 16 + +// 0 < weight < 64 +.macro load_weights_add_add + mov w6, w6 +.endm +.macro weight_add_add dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s1, v30.16b + umlal2 \dst, \s2, v31.16b +.else + umull \dst, \s1, v30.8b + umlal \dst, \s2, v31.8b +.endif +.endm + +// weight > 64 +.macro load_weights_add_sub + neg w7, w7 +.endm +.macro weight_add_sub dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s1, v30.16b + umlsl2 \dst, \s2, v31.16b +.else + umull \dst, \s1, v30.8b + umlsl \dst, \s2, v31.8b +.endif +.endm + +// weight < 0 +.macro load_weights_sub_add + neg w6, w6 +.endm +.macro weight_sub_add dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s2, v31.16b + umlsl2 \dst, \s1, v30.16b +.else + umull \dst, \s2, v31.8b + umlsl \dst, \s1, v30.8b +.endif +.endm + +.macro AVG_WEIGHT ext +function pixel_avg_weight_w4_\ext\()_neon + load_weights_\ext + dup v30.8b, w6 + dup v31.8b, w7 +1: // height loop + subs w9, w9, #2 + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x4], x5 + weight_\ext v4.8h, v0.8b, v1.8b + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x5 + sqrshrun v0.8b, v4.8h, #6 + weight_\ext v5.8h, v2.8b, v3.8b + st1 {v0.s}[0], [x0], x1 + sqrshrun v1.8b, v5.8h, #6 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_weight_w8_\ext\()_neon + load_weights_\ext + dup v30.8b, w6 + dup v31.8b, w7 +1: // height loop + subs w9, w9, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x4], x5 + weight_\ext v16.8h, v0.8b, v1.8b + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x4], x5 + weight_\ext v17.8h, v2.8b, v3.8b + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x4], x5 + weight_\ext v18.8h, v4.8b, v5.8b + ld1 
{v6.8b}, [x2], x3 + ld1 {v7.8b}, [x4], x5 + weight_\ext v19.8h, v6.8b, v7.8b + sqrshrun v0.8b, v16.8h, #6 + sqrshrun v1.8b, v17.8h, #6 + sqrshrun v2.8b, v18.8h, #6 + sqrshrun v3.8b, v19.8h, #6 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_weight_w16_\ext\()_neon + load_weights_\ext + dup v30.16b, w6 + dup v31.16b, w7 +1: // height loop + subs w9, w9, #2 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x4], x5 + weight_\ext v16.8h, v0.8b, v1.8b + weight_\ext v17.8h, v0.16b, v1.16b, 2 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x4], x5 + weight_\ext v18.8h, v2.8b, v3.8b + weight_\ext v19.8h, v2.16b, v3.16b, 2 + sqrshrun v0.8b, v16.8h, #6 + sqrshrun v1.8b, v18.8h, #6 + sqrshrun2 v0.16b, v17.8h, #6 + sqrshrun2 v1.16b, v19.8h, #6 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + b.gt 1b + ret +endfunc +.endm + +AVG_WEIGHT add_add +AVG_WEIGHT add_sub +AVG_WEIGHT sub_add + +function pixel_avg_w4_neon +1: subs w9, w9, #2 + ld1 {v0.s}[0], [x2], x3 + ld1 {v2.s}[0], [x4], x5 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x5 + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_w8_neon +1: subs w9, w9, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x4], x5 + ld1 {v2.8b}, [x2], x3 + urhadd v0.8b, v0.8b, v1.8b + ld1 {v3.8b}, [x4], x5 + st1 {v0.8b}, [x0], x1 + ld1 {v4.8b}, [x2], x3 + urhadd v1.8b, v2.8b, v3.8b + ld1 {v5.8b}, [x4], x5 + st1 {v1.8b}, [x0], x1 + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x4], x5 + urhadd v2.8b, v4.8b, v5.8b + urhadd v3.8b, v6.8b, v7.8b + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_w16_neon +1: subs w9, w9, #4 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x4], x5 + ld1 {v2.16b}, [x2], x3 + urhadd v0.16b, v0.16b, v1.16b + ld1 {v3.16b}, [x4], x5 + st1 {v0.16b}, [x0], x1 + ld1 {v4.16b}, [x2], x3 + urhadd v1.16b, v2.16b, v3.16b + ld1 {v5.16b}, [x4], x5 + st1 {v1.16b}, [x0], x1 + ld1 {v6.16b}, [x2], x3 + ld1 {v7.16b}, [x4], x5 + urhadd v2.16b, v4.16b, v5.16b + urhadd v3.16b, v6.16b, v7.16b + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_pixel_avg2_w4_neon, export=1 +1: + subs w5, w5, #2 + ld1 {v0.s}[0], [x2], x3 + ld1 {v2.s}[0], [x4], x3 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x3 + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_pixel_avg2_w8_neon, export=1 +1: + subs w5, w5, #2 + ld1 {v0.8b}, [x2], x3 + ld1 {v2.8b}, [x4], x3 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.8b}, [x2], x3 + ld1 {v3.8b}, [x4], x3 + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_pixel_avg2_w16_neon, export=1 +1: + subs w5, w5, #2 + ld1 {v0.16b}, [x2], x3 + ld1 {v2.16b}, [x4], x3 + urhadd v0.16b, v0.16b, v2.16b + ld1 {v1.16b}, [x2], x3 + ld1 {v3.16b}, [x4], x3 + urhadd v1.16b, v1.16b, v3.16b + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_pixel_avg2_w20_neon, export=1 + sub x1, x1, #16 +1: + subs w5, w5, #2 + ld1 {v0.16b,v1.16b}, [x2], x3 + ld1 {v2.16b,v3.16b}, [x4], x3 + urhadd v0.16b, v0.16b, v2.16b + urhadd v1.8b, v1.8b, v3.8b + ld1 {v4.16b,v5.16b}, [x2], x3 + ld1 {v6.16b,v7.16b}, [x4], x3 + urhadd v4.16b, v4.16b, v6.16b + urhadd v5.8b, v5.8b, v7.8b + st1 {v0.16b}, [x0], #16 + st1 {v1.s}[0], [x0], x1 + st1 {v4.16b}, 
[x0], #16 + st1 {v5.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +.macro weight_prologue type + mov w9, w5 // height +.ifc \type, full + ldr w12, [x4, #32] // denom +.endif + ldp w4, w5, [x4, #32+4] // scale, offset + dup v0.16b, w4 + dup v1.8h, w5 +.ifc \type, full + neg w12, w12 + dup v2.8h, w12 +.endif +.endm + +// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, +// intptr_t dst_stride, const x264_weight_t *weight, int h ) +function x264_mc_weight_w20_neon, export=1 + weight_prologue full + sub x1, x1, #16 +1: + subs w9, w9, #2 + ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 + ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 + umull v22.8h, v16.8b, v0.8b + umull v23.8h, v17.8b, v0.8b + zip1 v18.2s, v18.2s, v21.2s + umull v25.8h, v19.8b, v0.8b + umull v26.8h, v20.8b, v0.8b + umull v24.8h, v18.8b, v0.8b + srshl v22.8h, v22.8h, v2.8h + srshl v23.8h, v23.8h, v2.8h + srshl v24.8h, v24.8h, v2.8h + srshl v25.8h, v25.8h, v2.8h + srshl v26.8h, v26.8h, v2.8h + add v22.8h, v22.8h, v1.8h + add v23.8h, v23.8h, v1.8h + add v24.8h, v24.8h, v1.8h + add v25.8h, v25.8h, v1.8h + add v26.8h, v26.8h, v1.8h + sqxtun v4.8b, v22.8h + sqxtun2 v4.16b, v23.8h + sqxtun v6.8b, v24.8h + sqxtun v5.8b, v25.8h + sqxtun2 v5.16b, v26.8h + st1 {v4.16b}, [x0], #16 + st1 {v6.s}[0], [x0], x1 + st1 {v5.16b}, [x0], #16 + st1 {v6.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w16_neon, export=1 + weight_prologue full +weight16_loop: +1: + subs w9, w9, #2 + ld1 {v4.16b}, [x2], x3 + ld1 {v5.16b}, [x2], x3 + umull v22.8h, v4.8b, v0.8b + umull2 v23.8h, v4.16b, v0.16b + umull v24.8h, v5.8b, v0.8b + umull2 v25.8h, v5.16b, v0.16b + srshl v22.8h, v22.8h, v2.8h + srshl v23.8h, v23.8h, v2.8h + srshl v24.8h, v24.8h, v2.8h + srshl v25.8h, v25.8h, v2.8h + add v22.8h, v22.8h, v1.8h + add v23.8h, v23.8h, v1.8h + add v24.8h, v24.8h, v1.8h + add v25.8h, v25.8h, v1.8h + sqxtun v4.8b, v22.8h + sqxtun2 v4.16b, v23.8h + sqxtun v5.8b, v24.8h + sqxtun2 v5.16b, v25.8h + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w8_neon, export=1 + weight_prologue full +1: + subs w9, w9, #2 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 + umull v4.8h, v16.8b, v0.8b + umull v5.8h, v17.8b, v0.8b + srshl v4.8h, v4.8h, v2.8h + srshl v5.8h, v5.8h, v2.8h + add v4.8h, v4.8h, v1.8h + add v5.8h, v5.8h, v1.8h + sqxtun v16.8b, v4.8h + sqxtun v17.8b, v5.8h + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w4_neon, export=1 + weight_prologue full +1: + subs w9, w9, #2 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 + umull v4.8h, v16.8b, v0.8b + srshl v4.8h, v4.8h, v2.8h + add v4.8h, v4.8h, v1.8h + sqxtun v16.8b, v4.8h + st1 {v16.s}[0], [x0], x1 + st1 {v16.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w20_nodenom_neon, export=1 + weight_prologue nodenom + sub x1, x1, #16 +1: + subs w9, w9, #2 + ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 + mov v27.16b, v1.16b + mov v28.16b, v1.16b + ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 + mov v31.16b, v1.16b + mov v29.16b, v1.16b + mov v30.16b, v1.16b + zip1 v18.2s, v18.2s, v21.2s + umlal v27.8h, v16.8b, v0.8b + umlal v28.8h, v17.8b, v0.8b + umlal v31.8h, v18.8b, v0.8b + umlal v29.8h, v19.8b, v0.8b + umlal v30.8h, v20.8b, v0.8b + sqxtun v4.8b, v27.8h + sqxtun2 v4.16b, v28.8h + sqxtun v5.8b, v29.8h + sqxtun2 v5.16b, v30.8h + sqxtun v6.8b, v31.8h + st1 {v4.16b}, [x0], #16 + st1 {v6.s}[0], [x0], x1 + st1 {v5.16b}, [x0], #16 + st1 {v6.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function 
x264_mc_weight_w16_nodenom_neon, export=1 + weight_prologue nodenom +1: + subs w9, w9, #2 + ld1 {v6.16b}, [x2], x3 + mov v27.16b, v1.16b + mov v28.16b, v1.16b + ld1 {v7.16b}, [x2], x3 + mov v29.16b, v1.16b + mov v30.16b, v1.16b + umlal v27.8h, v6.8b, v0.8b + umlal2 v28.8h, v6.16b, v0.16b + umlal v29.8h, v7.8b, v0.8b + umlal2 v30.8h, v7.16b, v0.16b + sqxtun v4.8b, v27.8h + sqxtun2 v4.16b, v28.8h + sqxtun v5.8b, v29.8h + sqxtun2 v5.16b, v30.8h + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w8_nodenom_neon, export=1 + weight_prologue nodenom +1: + subs w9, w9, #2 + ld1 {v16.8b}, [x2], x3 + mov v27.16b, v1.16b + ld1 {v17.8b}, [x2], x3 + mov v29.16b, v1.16b + umlal v27.8h, v16.8b, v0.8b + umlal v29.8h, v17.8b, v0.8b + sqxtun v4.8b, v27.8h + sqxtun v5.8b, v29.8h + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w4_nodenom_neon, export=1 + weight_prologue nodenom +1: + subs w9, w9, #2 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 + mov v27.16b, v1.16b + umlal v27.8h, v16.8b, v0.8b + sqxtun v4.8b, v27.8h + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +.macro weight_simple_prologue + ldr w6, [x4] // offset + dup v1.16b, w6 +.endm + +.macro weight_simple name op +function x264_mc_weight_w20_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ldr s18, [x2, #16] + ld1 {v16.16b}, [x2], x3 + ldr s19, [x2, #16] + ld1 {v17.16b}, [x2], x3 + \op v18.8b, v18.8b, v1.8b + \op v16.16b, v16.16b, v1.16b + \op v19.8b, v19.8b, v1.8b + \op v17.16b, v17.16b, v1.16b + str s18, [x0, #16] + st1 {v16.16b}, [x0], x1 + str s19, [x0, #16] + st1 {v17.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w16_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ld1 {v16.16b}, [x2], x3 + ld1 {v17.16b}, [x2], x3 + \op v16.16b, v16.16b, v1.16b + \op v17.16b, v17.16b, v1.16b + st1 {v16.16b}, [x0], x1 + st1 {v17.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w8_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 + \op v16.8b, v16.8b, v1.8b + \op v17.8b, v17.8b, v1.8b + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w4_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 + \op v16.8b, v16.8b, v1.8b + st1 {v16.s}[0], [x0], x1 + st1 {v16.s}[1], [x0], x1 + b.gt 1b + ret +endfunc +.endm + +weight_simple offsetadd, uqadd +weight_simple offsetsub, uqsub + + +// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height ) +function x264_mc_copy_w4_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x2], x3 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_copy_w8_neon, export=1 +1: subs w4, w4, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_copy_w16_neon, export=1 +1: subs w4, w4, #4 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x2], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x2], x3 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + 
st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v, +// intptr_t i_dst_stride, +// uint8_t *src, intptr_t i_src_stride, +// int dx, int dy, int i_width, int i_height ); +function x264_mc_chroma_neon, export=1 + ldr w15, [sp] // height + sbfx x12, x6, #3, #29 // asr(3) and sign extend + sbfx x11, x5, #3, #29 // asr(3) and sign extend + cmp w7, #4 + mul x12, x12, x4 + add x3, x3, x11, lsl #1 + + and w5, w5, #7 + and w6, w6, #7 + + add x3, x3, x12 + + //pld [x3] + //pld [x3, x4] + + b.gt mc_chroma_w8_neon + b.eq mc_chroma_w4_neon +endfunc + +.macro CHROMA_MC_START r00, r01, r10, r11 + mul w12, w5, w6 // cD = d8x *d8y + lsl w13, w5, #3 + add w9, w12, #64 + lsl w14, w6, #3 + tst w12, w12 + sub w9, w9, w13 + sub w10, w13, w12 // cB = d8x *(8-d8y); + sub w11, w14, w12 // cC = (8-d8x)*d8y + sub w9, w9, w14 // cA = (8-d8x)*(8-d8y); +.endm + +.macro CHROMA_MC width, vsize +function mc_chroma_w\width\()_neon +// since the element size varies, there's a different index for the 2nd store +.if \width == 4 + .set st2, 1 +.else + .set st2, 2 +.endif + CHROMA_MC_START + b.eq 2f + + ld2 {v28.8b,v29.8b}, [x3], x4 + dup v0.8b, w9 // cA + dup v1.8b, w10 // cB + + ext v6.8b, v28.8b, v6.8b, #1 + ext v7.8b, v29.8b, v7.8b, #1 + + ld2 {v30.8b,v31.8b}, [x3], x4 + dup v2.8b, w11 // cC + dup v3.8b, w12 // cD + + ext v22.8b, v30.8b, v22.8b, #1 + ext v23.8b, v31.8b, v23.8b, #1 + + trn1 v0.2s, v0.2s, v1.2s + trn1 v2.2s, v2.2s, v3.2s + + trn1 v4.2s, v28.2s, v6.2s + trn1 v5.2s, v29.2s, v7.2s + trn1 v20.2s, v30.2s, v22.2s + trn1 v21.2s, v31.2s, v23.2s +1: // height loop, interpolate xy + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v20.8b, v2.8b + umull v17.8h, v5.8b, v0.8b + umlal v17.8h, v21.8b, v2.8b + + ld2 {v28.8b,v29.8b}, [x3], x4 + transpose v24.2d, v25.2d, v16.2d, v17.2d + + ext v6.8b, v28.8b, v6.8b, #1 + ext v7.8b, v29.8b, v7.8b, #1 + + trn1 v4.2s, v28.2s, v6.2s + trn1 v5.2s, v29.2s, v7.2s + + add v16.8h, v24.8h, v25.8h + + umull v18.8h, v20.8b, v0.8b + umlal v18.8h, v4.8b, v2.8b + umull v19.8h, v21.8b, v0.8b + umlal v19.8h, v5.8b, v2.8b + + ld2 {v30.8b,v31.8b}, [x3], x4 + transpose v26.2d, v27.2d, v18.2d, v19.2d + + ext v22.8b, v30.8b, v22.8b, #1 + ext v23.8b, v31.8b, v23.8b, #1 + trn1 v20.2s, v30.2s, v22.2s + trn1 v21.2s, v31.2s, v23.2s + + add v17.8h, v26.8h, v27.8h + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[st2], [x1], x2 + st1 {v17.\vsize}[0], [x0], x2 + st1 {v17.\vsize}[st2], [x1], x2 + b.gt 1b + + ret +2: // dx or dy are 0 + tst w11, w11 + add w10, w10, w11 + dup v0.8b, w9 + dup v1.8b, w10 + + b.eq 4f + + ld1 {v4.8b}, [x3], x4 + ld1 {v6.8b}, [x3], x4 +3: // vertical interpolation loop + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + ld1 {v4.8b}, [x3], x4 + umlal v16.8h, v6.8b, v1.8b + umull v17.8h, v6.8b, v0.8b + ld1 {v6.8b}, [x3], x4 + umlal v17.8h, v4.8b, v1.8b + + rshrn v20.8b, v16.8h, #6 // uvuvuvuv + rshrn v21.8b, v17.8h, #6 // uvuvuvuv + + uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + + //pld [x3] + //pld [x3, x4] + + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[st2], [x0], x2 + st1 {v17.\vsize}[0], [x1], x2 + st1 {v17.\vsize}[st2], [x1], x2 + b.gt 3b + + ret + +4: // dy is 0 + ld1 {v4.8b,v5.8b}, [x3], x4 + ld1 {v6.8b,v7.8b}, [x3], x4 + + ext v5.8b, v4.8b, v5.8b, #2 + ext v7.8b, v6.8b, v7.8b, #2 +5: // horizontal 
interpolation loop + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v5.8b, v1.8b + umull v17.8h, v6.8b, v0.8b + umlal v17.8h, v7.8b, v1.8b + + ld1 {v4.8b,v5.8b}, [x3], x4 + ld1 {v6.8b,v7.8b}, [x3], x4 + rshrn v20.8b, v16.8h, #6 + rshrn v21.8b, v17.8h, #6 + ext v5.8b, v4.8b, v5.8b, #2 + ext v7.8b, v6.8b, v7.8b, #2 + uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + + //pld [x3] + //pld [x3, x4] + + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[st2], [x0], x2 + st1 {v17.\vsize}[0], [x1], x2 + st1 {v17.\vsize}[st2], [x1], x2 + b.gt 5b + + ret +endfunc +.endm + + CHROMA_MC 2, h + CHROMA_MC 4, s + +function mc_chroma_w8_neon + CHROMA_MC_START + b.eq 2f + ld2 {v4.16b,v5.16b}, [x3], x4 + ld2 {v20.16b,v21.16b}, [x3], x4 + dup v0.8b, w9 // cA + dup v1.8b, w10 // cB + + ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + + dup v2.8b, w11 // cC + dup v3.8b, w12 // cD + + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 + +1: // height loop, interpolate xy + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v6.8b, v1.8b + umlal v16.8h, v20.8b, v2.8b + umlal v16.8h, v22.8b, v3.8b + + umull v17.8h, v5.8b, v0.8b + umlal v17.8h, v7.8b, v1.8b + umlal v17.8h, v21.8b, v2.8b + umlal v17.8h, v23.8b, v3.8b + + ld2 {v4.16b,v5.16b}, [x3], x4 + + ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + + umull v18.8h, v20.8b, v0.8b + umlal v18.8h, v22.8b, v1.8b + umlal v18.8h, v4.8b, v2.8b + umlal v18.8h, v6.8b, v3.8b + + umull v19.8h, v21.8b, v0.8b + umlal v19.8h, v23.8b, v1.8b + umlal v19.8h, v5.8b, v2.8b + umlal v19.8h, v7.8b, v3.8b + + ld2 {v20.16b,v21.16b}, [x3], x4 + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + rshrn v18.8b, v18.8h, #6 + rshrn v19.8b, v19.8h, #6 + + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 + b.gt 1b + + ret +2: // dx or dy are 0 + tst w11, w11 + add w10, w10, w11 + dup v0.8b, w9 + dup v1.8b, w10 + + b.eq 4f + + ld2 {v4.8b,v5.8b}, [x3], x4 + ld2 {v6.8b,v7.8b}, [x3], x4 +3: // vertical interpolation loop + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b //U + umlal v16.8h, v6.8b, v1.8b + umull v17.8h, v5.8b, v0.8b //V + umlal v17.8h, v7.8b, v1.8b + + ld2 {v4.8b,v5.8b}, [x3], x4 + + umull v18.8h, v6.8b, v0.8b + umlal v18.8h, v4.8b, v1.8b + umull v19.8h, v7.8b, v0.8b + umlal v19.8h, v5.8b, v1.8b + + ld2 {v6.8b,v7.8b}, [x3], x4 + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + rshrn v18.8b, v18.8h, #6 + rshrn v19.8b, v19.8h, #6 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 + b.gt 3b + + ret +4: // dy is 0 + ld2 {v4.16b,v5.16b}, [x3], x4 + ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + ld2 {v20.16b,v21.16b}, [x3], x4 + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 +5: // horizontal interpolation loop + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b //U + umlal v16.8h, v6.8b, v1.8b + umull v17.8h, v5.8b, v0.8b //V + umlal v17.8h, v7.8b, v1.8b + + ld2 {v4.16b,v5.16b}, [x3], x4 + + umull v18.8h, v20.8b, v0.8b + umlal v18.8h, v22.8b, v1.8b + umull v19.8h, v21.8b, v0.8b + umlal v19.8h, v23.8b, v1.8b + + ld2 {v20.16b,v21.16b}, [x3], x4 + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + rshrn v18.8b, v18.8h, #6 + rshrn v19.8b, v19.8h, #6 + + 
ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 + b.gt 5b + + ret +endfunc + +//void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, +// intptr_t stride, int width, int height, int16_t *buf ) +function x264_hpel_filter_neon, export=1 + ubfm x9, x3, #0, #3 + add w15, w5, w9 + sub x13, x3, x9 // align src + sub x10, x0, x9 + sub x11, x1, x9 + sub x12, x2, x9 + movi v30.16b, #5 + movi v31.16b, #20 +1: // line start + mov x3, x13 + mov x2, x12 + mov x1, x11 + mov x0, x10 + add x7, x3, #16 // src pointer next 16b for horiz filter + mov x5, x15 // restore width + sub x3, x3, x4, lsl #1 // src - 2*stride + ld1 {v28.16b}, [x7], #16 // src[16:31] + + add x9, x3, x5 // holds src - 2*stride + width + + ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] + ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] + ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] + ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] + ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] + ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] + + ext v22.16b, v7.16b, v18.16b, #14 + uaddl v1.8h, v16.8b, v21.8b + ext v26.16b, v18.16b, v28.16b, #3 + umlsl v1.8h, v17.8b, v30.8b + ext v23.16b, v7.16b, v18.16b, #15 + umlal v1.8h, v18.8b, v31.8b + ext v24.16b, v18.16b, v28.16b, #1 + umlal v1.8h, v19.8b, v31.8b + ext v25.16b, v18.16b, v28.16b, #2 + umlsl v1.8h, v20.8b, v30.8b +2: // next 16 pixel of line + subs x5, x5, #16 + sub x3, x9, x5 // src - 2*stride += 16 + + uaddl v4.8h, v22.8b, v26.8b + uaddl2 v5.8h, v22.16b, v26.16b + sqrshrun v6.8b, v1.8h, #5 + umlsl v4.8h, v23.8b, v30.8b + umlsl2 v5.8h, v23.16b, v30.16b + umlal v4.8h, v18.8b, v31.8b + umlal2 v5.8h, v18.16b, v31.16b + umlal v4.8h, v24.8b, v31.8b + umlal2 v5.8h, v24.16b, v31.16b + umlsl v4.8h, v25.8b, v30.8b + umlsl2 v5.8h, v25.16b, v30.16b + + uaddl2 v2.8h, v16.16b, v21.16b + sqrshrun v4.8b, v4.8h, #5 + mov v7.16b, v18.16b + sqrshrun2 v4.16b, v5.8h, #5 + + umlsl2 v2.8h, v17.16b, v30.16b + ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] + umlal2 v2.8h, v18.16b, v31.16b + ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] + umlal2 v2.8h, v19.16b, v31.16b + ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] + umlsl2 v2.8h, v20.16b, v30.16b + ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] + st1 {v4.16b}, [x0], #16 + sqrshrun2 v6.16b, v2.8h, #5 + ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] + ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] + + ext v22.16b, v0.16b, v1.16b, #12 + ext v26.16b, v1.16b, v2.16b, #6 + ext v23.16b, v0.16b, v1.16b, #14 + st1 {v6.16b}, [x1], #16 + uaddl v3.8h, v16.8b, v21.8b + ext v25.16b, v1.16b, v2.16b, #4 + umlsl v3.8h, v17.8b, v30.8b + ext v24.16b, v1.16b, v2.16b, #2 + + umlal v3.8h, v18.8b, v31.8b + add v4.8h, v22.8h, v26.8h + umlal v3.8h, v19.8b, v31.8b + add v5.8h, v23.8h, v25.8h + umlsl v3.8h, v20.8b, v30.8b + add v6.8h, v24.8h, v1.8h + + ext v22.16b, v1.16b, v2.16b, #12 + ext v26.16b, v2.16b, v3.16b, #6 + ext v23.16b, v1.16b, v2.16b, #14 + ext v25.16b, v2.16b, v3.16b, #4 + ext v24.16b, v2.16b, v3.16b, #2 + + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v25.8h + add v24.8h, v24.8h, v2.8h + + sub v4.8h, v4.8h, v5.8h // a-b + sub v5.8h, v5.8h, v6.8h // b-c + + sub v22.8h, v22.8h, v23.8h // a-b + sub v23.8h, v23.8h, v24.8h // b-c + + sshr v4.8h, v4.8h, #2 // (a-b)/4 + sshr v22.8h, v22.8h, #2 // (a-b)/4 + sub v4.8h, v4.8h, v5.8h // (a-b)/4-b+c + sub v22.8h, 
v22.8h, v23.8h // (a-b)/4-b+c + sshr v4.8h, v4.8h, #2 // ((a-b)/4-b+c)/4 + sshr v22.8h, v22.8h, #2 // ((a-b)/4-b+c)/4 + add v4.8h, v4.8h, v6.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + + sqrshrun v4.8b, v4.8h, #6 + ld1 {v28.16b}, [x7], #16 // src[16:31] + mov v0.16b, v2.16b + ext v23.16b, v7.16b, v18.16b, #15 + sqrshrun2 v4.16b, v22.8h, #6 + mov v1.16b, v3.16b + ext v22.16b, v7.16b, v18.16b, #14 + ext v24.16b, v18.16b, v28.16b, #1 + ext v25.16b, v18.16b, v28.16b, #2 + ext v26.16b, v18.16b, v28.16b, #3 + + st1 {v4.16b}, [x2], #16 + b.gt 2b + + subs w6, w6, #1 + add x10, x10, x4 + add x11, x11, x4 + add x12, x12, x4 + add x13, x13, x4 + b.gt 1b + + ret +endfunc + +// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, +// uint8_t *dstv, uint8_t *dstc, intptr_t src_stride, +// intptr_t dst_stride, int width, int height ) +function x264_frame_init_lowres_core_neon, export=1 + ldr w8, [sp] + sub x10, x6, w7, uxtw // dst_stride - width + and x10, x10, #~15 + +1: + mov w9, w7 // width + mov x11, x0 // src0 + add x12, x0, x5 // src1 = src0 + src_stride + add x13, x0, x5, lsl #1 // src2 = src1 + src_stride + + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 + + urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x] + urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x] +2: + subs w9, w9, #16 + urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] + urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] + + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 + urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] + urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x] + ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2] + ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2] + + urhadd v16.16b, v20.16b, v21.16b + urhadd v18.16b, v22.16b, v23.16b + urhadd v17.16b, v21.16b, v24.16b + urhadd v19.16b, v23.16b, v25.16b + + st1 {v16.16b}, [x1], #16 + st1 {v18.16b}, [x3], #16 + st1 {v17.16b}, [x2], #16 + st1 {v19.16b}, [x4], #16 + b.le 3f + + subs w9, w9, #16 + urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] + urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] + + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 + urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] + urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x] + ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2] + ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2] + + urhadd v16.16b, v30.16b, v21.16b + urhadd v18.16b, v31.16b, v23.16b + urhadd v17.16b, v21.16b, v24.16b + urhadd v19.16b, v23.16b, v25.16b + + st1 {v16.16b}, [x1], #16 + st1 {v18.16b}, [x3], #16 + st1 {v17.16b}, [x2], #16 + st1 {v19.16b}, [x4], #16 + b.gt 2b +3: + subs w8, w8, #1 + add x0, x0, x5, lsl #1 + add x1, x1, x10 + add x2, x2, x10 + add x3, x3, x10 + add x4, x4, x10 + b.gt 1b + + ret +endfunc + +function x264_load_deinterleave_chroma_fenc_neon, export=1 + mov x4, #FENC_STRIDE/2 + b load_deinterleave_chroma +endfunc + +function x264_load_deinterleave_chroma_fdec_neon, export=1 + mov x4, #FDEC_STRIDE/2 +load_deinterleave_chroma: + ld2 {v0.8b,v1.8b}, [x1], x2 + ld2 {v2.8b,v3.8b}, [x1], x2 + subs w3, w3, #2 + st1 {v0.8b}, [x0], x4 + st1 {v1.8b}, [x0], x4 + st1 {v2.8b}, [x0], x4 + st1 {v3.8b}, [x0], x4 + b.gt load_deinterleave_chroma + + ret +endfunc + +function x264_plane_copy_deinterleave_neon, export=1 + add w9, 
w6, #15 + and w9, w9, #0xfffffff0 + sub x1, x1, x9 + sub x3, x3, x9 + sub x5, x5, x9, lsl #1 +1: + ld2 {v0.16b,v1.16b}, [x4], #32 + subs w9, w9, #16 + st1 {v0.16b}, [x0], #16 + st1 {v1.16b}, [x2], #16 + b.gt 1b + + add x4, x4, x5 + subs w7, w7, #1 + add x0, x0, x1 + add x2, x2, x3 + mov w9, w6 + b.gt 1b + + ret +endfunc + +.macro deinterleave_rgb + subs x11, x11, #8 + st1 {v0.8b}, [x0], #8 + st1 {v1.8b}, [x2], #8 + st1 {v2.8b}, [x4], #8 + b.gt 1b + + subs w10, w10, #1 + add x0, x0, x1 + add x2, x2, x3 + add x4, x4, x5 + add x6, x6, x7 + mov x11, x9 + b.gt 1b +.endm + +function x264_plane_copy_deinterleave_rgb_neon, export=1 +#if SYS_MACOSX + ldr w8, [sp] + ldp w9, w10, [sp, #4] +#else + ldr x8, [sp] + ldp x9, x10, [sp, #8] +#endif + cmp w8, #3 + uxtw x9, w9 + add x11, x9, #7 + and x11, x11, #~7 + sub x1, x1, x11 + sub x3, x3, x11 + sub x5, x5, x11 + b.ne 4f + sub x7, x7, x11, lsl #1 + sub x7, x7, x11 +1: + ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24 + deinterleave_rgb + + ret +4: + sub x7, x7, x11, lsl #2 +1: + ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32 + deinterleave_rgb + + ret +endfunc + +function x264_plane_copy_interleave_neon, export=1 + add w9, w6, #15 + and w9, w9, #0xfffffff0 + sub x1, x1, x9, lsl #1 + sub x3, x3, x9 + sub x5, x5, x9 +1: + ld1 {v0.16b}, [x2], #16 + ld1 {v1.16b}, [x4], #16 + subs w9, w9, #16 + st2 {v0.16b,v1.16b}, [x0], #32 + b.gt 1b + + subs w7, w7, #1 + add x0, x0, x1 + add x2, x2, x3 + add x4, x4, x5 + mov w9, w6 + b.gt 1b + + ret +endfunc + +function x264_store_interleave_chroma_neon, export=1 + mov x5, #FDEC_STRIDE +1: + ld1 {v0.8b}, [x2], x5 + ld1 {v1.8b}, [x3], x5 + ld1 {v2.8b}, [x2], x5 + ld1 {v3.8b}, [x3], x5 + subs w4, w4, #2 + zip1 v4.16b, v0.16b, v1.16b + zip1 v5.16b, v2.16b, v3.16b + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 + b.gt 1b + + ret +endfunc diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c new file mode 100644 index 00000000..73f6df96 --- /dev/null +++ b/common/aarch64/mc-c.c @@ -0,0 +1,249 @@ +/***************************************************************************** + * mc-c.c: aarch64 motion compensation + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+
+void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
+                                        pixel *dstv, intptr_t i_dstv,
+                                        pixel *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+                                            pixel *dstb, intptr_t i_dstb,
+                                            pixel *dstc, intptr_t i_dstc,
+                                            pixel *src,  intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst,  intptr_t i_dst,
+                                      pixel *srcu, intptr_t i_srcu,
+                                      pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
+#define MC_WEIGHT(func)\
+void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+\
+static void (* x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+{\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w8##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w20##func##_neon,\
+};
+
+MC_WEIGHT()
+MC_WEIGHT(_nodenom)
+MC_WEIGHT(_offsetadd)
+MC_WEIGHT(_offsetsub)
+
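For reference, the w4/w8/w16/w20 weight kernels selected by these tables compute the same per-pixel result as the scalar weighting code in common/mc.c. The sketch below is illustrative only and not part of the patch; it assumes x264_clip_uint8() from common/common.h and the i_scale/i_denom/i_offset fields of x264_weight_t:

    /* Minimal scalar model of the mc_weight_* NEON kernels (illustration, not patch code).
     * "full":      clip( ((src*scale + (1<<(denom-1))) >> denom) + offset )
     * "nodenom":   clip( src*scale + offset )       (denom == 0)
     * "offsetadd": clip( src + offset )             (scale == 1<<denom, offset >= 0)
     * "offsetsub": clip( src - |offset| )           (scale == 1<<denom, offset <  0) */
    static void mc_weight_ref( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                               const x264_weight_t *wt, int width, int height )
    {
        for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < width; x++ )
            {
                int v = src[x] * wt->i_scale;
                if( wt->i_denom )   /* rounding shift, as done by srshl with a negative denom */
                    v = ( v + (1 << (wt->i_denom-1)) ) >> wt->i_denom;
                dst[x] = x264_clip_uint8( v + wt->i_offset );
            }
    }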
+void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+
+#if !HIGH_BIT_DEPTH
+static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
+{
+    if( w->i_scale == 1<<w->i_denom )
+    {
+        if( w->i_offset < 0 )
+        {
+            w->weightfn = x264_mc_offsetsub_wtab_neon;
+            w->cachea[0] = -w->i_offset;
+        }
+        else
+        {
+            w->weightfn = x264_mc_offsetadd_wtab_neon;
+            w->cachea[0] = w->i_offset;
+        }
+    }
+    else if( !w->i_denom )
+        w->weightfn = x264_mc_nodenom_wtab_neon;
+    else
+        w->weightfn = x264_mc_wtab_neon;
+}
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
+{
+    NULL,
+    x264_pixel_avg2_w4_neon,
+    x264_pixel_avg2_w8_neon,
+    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
+    x264_pixel_avg2_w16_neon,
+    x264_pixel_avg2_w20_neon,
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
+{
+    NULL,
+    x264_mc_copy_w4_neon,
+    x264_mc_copy_w8_neon,
+    NULL,
+    x264_mc_copy_w16_neon,
+};
+
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+static void mc_luma_neon( uint8_t *dst,    intptr_t i_dst_stride,
+                          uint8_t *src[4], intptr_t i_src_stride,
+                          int mvx, int mvy,
+                          int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if( (mvy&3) == 3 )          // explicit if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 )          /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](
+                dst, i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
+    }
+    else if( weight->weightfn )
+        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
+    else
+        x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst,    intptr_t *i_dst_stride,
+                              uint8_t *src[4], intptr_t i_src_stride,
+                              int mvx, int mvy,
+                              int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if( (mvy&3) == 3 )          // explicit if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 )          /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](
+                dst, *i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+        return dst;
+    }
+    else if( weight->weightfn )
+    {
+        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+        return dst;
+    }
+    else
+    {
+        *i_dst_stride = i_src_stride;
+        return src1;
+    }
+}
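The pixel_avg_weight_w{4,8,16} kernels in mc-a.S split weighted bi-prediction into add_add, add_sub and sub_add variants because the NEON byte multiplies are unsigned: whichever weight falls outside [0,64] is negated in load_weights_* and its multiply-accumulate becomes a multiply-subtract (umlsl). A minimal scalar sketch of the value all three variants produce, assuming x264_clip_uint8() from common/common.h (illustration, not patch code):

    /* Scalar model of pixel_avg_WxH_neon with an explicit weight.
     * The two weights always sum to 64; weight == 32 is routed by AVGH to the
     * unweighted urhadd loops instead. */
    static void pixel_avg_weight_ref( uint8_t *dst,  intptr_t i_dst,
                                      uint8_t *src1, intptr_t i_src1,
                                      uint8_t *src2, intptr_t i_src2,
                                      int width, int height, int i_weight1 )
    {
        int i_weight2 = 64 - i_weight1;
        for( int y = 0; y < height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
            for( int x = 0; x < width; x++ )
                dst[x] = x264_clip_uint8( ( src1[x]*i_weight1 + src2[x]*i_weight2 + 32 ) >> 6 );
    }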
+
+void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+                            uint8_t *src, intptr_t stride, int width,
+                            int height, int16_t *buf );
+#endif // !HIGH_BIT_DEPTH
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
+{
+#if !HIGH_BIT_DEPTH
+    if( cpu&X264_CPU_ARMV8 )
+    {
+        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
+        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
+        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
+    }
+
+    if( !(cpu&X264_CPU_NEON) )
+        return;
+
+    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon;
+    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
+    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;
+
+    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
+    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+    pf->plane_copy_interleave       = x264_plane_copy_interleave_neon;
+
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
+
+    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
+    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
+    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
+    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
+    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
+    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
+    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
+
+    pf->weight       = x264_mc_wtab_neon;
+    pf->offsetadd    = x264_mc_offsetadd_wtab_neon;
+    pf->offsetsub    = x264_mc_offsetsub_wtab_neon;
+    pf->weight_cache = x264_weight_cache_neon;
+
+    pf->mc_chroma = x264_mc_chroma_neon;
+    pf->mc_luma   = mc_luma_neon;
+    pf->get_ref   = get_ref_neon;
+    pf->hpel_filter = x264_hpel_filter_neon;
+    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/aarch64/mc.h b/common/aarch64/mc.h
new file mode 100644
index 00000000..feba321b
--- /dev/null
+++ b/common/aarch64/mc.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/ + +#ifndef X264_AARCH64_MC_H +#define X264_AARCH64_MC_H + +void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf ); + +#endif diff --git a/common/mc.c b/common/mc.c index 6797f0ac..6a8b1b81 100644 --- a/common/mc.c +++ b/common/mc.c @@ -35,6 +35,9 @@ #if ARCH_ARM #include "arm/mc.h" #endif +#if ARCH_AARCH64 +#include "aarch64/mc.h" +#endif static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, @@ -641,6 +644,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent ) #if HAVE_ARMV6 x264_mc_init_arm( cpu, pf ); #endif +#if ARCH_AARCH64 + x264_mc_init_aarch64( cpu, pf ); +#endif if( cpu_independent ) {
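A closing note on the chroma path: the cA/cB/cC/cD coefficients set up in CHROMA_MC_START above are the standard H.264 1/8-pel bilinear weights, and the rshrn #6 narrowings supply the +32 rounding. A minimal scalar sketch for one already deinterleaved plane, with src pre-advanced to the top-left full-pel sample and dx/dy the 1/8-pel fractions (illustration only, assuming <stdint.h> and <stddef.h>):

    /* Scalar model of the bilinear interpolation performed per plane by
     * x264_mc_chroma_neon; the weights sum to 64, so no clipping is needed. */
    static void mc_chroma_ref( uint8_t *dst, intptr_t i_dst,
                               uint8_t *src, intptr_t i_src,
                               int dx, int dy, int width, int height )
    {
        int cA = (8-dx)*(8-dy);     /* top-left     */
        int cB = dx*(8-dy);         /* top-right    */
        int cC = (8-dx)*dy;         /* bottom-left  */
        int cD = dx*dy;             /* bottom-right */
        for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < width; x++ )
                dst[x] = ( cA*src[x]       + cB*src[x+1] +
                           cC*src[x+i_src] + cD*src[x+i_src+1] + 32 ) >> 6;
    }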