From: David Conrad Date: Sun, 23 Aug 2009 06:55:29 +0000 (-0700) Subject: GSOC merge part 3: ARM NEON pixel assembly functions X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=52f9719b4c3e58aaa6cbd6d83950444e022aefea;p=libx264 GSOC merge part 3: ARM NEON pixel assembly functions SAD, SADX3/X4, SSD, SATD, SA8D, Hadamard_AC, VAR, VAR2, SSIM --- diff --git a/Makefile b/Makefile index 725c9190..2b3d029f 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif # NEON optims ifeq ($(ARCH),ARM) ifneq ($(AS),) -ASMSRC += common/arm/cpu-a.S +ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S OBJASM = $(ASMSRC:%.S=%.o) endif endif diff --git a/common/arm/asm.S b/common/arm/asm.S new file mode 100644 index 00000000..529fa0ce --- /dev/null +++ b/common/arm/asm.S @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + + .macro require8, val=1 + .eabi_attribute 24, \val + .endm + + .macro preserve8, val=1 + .eabi_attribute 25, \val + .endm + + .macro function name, export=0 +.if \export + .global \name +.endif + .type \name, %function + .func \name +\name: + .endm + + .macro movrel rd, val +#if defined(HAVE_ARMV6T2) && !defined(PIC) + movw \rd, #:lower16:\val + movt \rd, #:upper16:\val +#else + ldr \rd, =\val +#endif + .endm + +#define FENC_STRIDE 16 +#define FDEC_STRIDE 32 + +.macro HORIZ_ADD dest, a, b +.ifnb \b + vadd.u16 \a, \a, \b +.endif + vpaddl.u16 \a, \a + vpaddl.u32 \dest, \a +.endm + +.macro SUMSUB_AB sum, diff, a, b + vadd.s16 \sum, \a, \b + vsub.s16 \diff, \a, \b +.endm + +.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d + SUMSUB_AB \s1, \d1, \a, \b + SUMSUB_AB \s2, \d2, \c, \d +.endm + +.macro ABS2 a b + vabs.s16 \a, \a + vabs.s16 \b, \b +.endm + +// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes) +// op = sumsub/amax (sum and diff / maximum of absolutes) +// d1/2 = destination registers +// s1/2 = source registers +.macro HADAMARD dist, op, d1, d2, s1, s2 +.if \dist == 1 + vtrn.16 \s1, \s2 +.else + vtrn.32 \s1, \s2 +.endif +.ifc \op, sumsub + SUMSUB_AB \d1, \d2, \s1, \s2 +.else + vabs.s16 \s1, \s1 + vabs.s16 \s2, \s2 + vmax.s16 \d1, \s1, \s2 +.endif +.endm diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S new file mode 100644 index 00000000..ccde3bb4 --- /dev/null +++ b/common/arm/cpu-a.S @@ -0,0 +1,106 @@ +/***************************************************************************** + * cpu-a.S: h264 encoder library + ***************************************************************************** + * Copyright (C) 2009 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either 
version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + *****************************************************************************/ + +#include "asm.S" + +.fpu neon +.align + +// done in gas because .fpu neon overrides the refusal to assemble +// instructions the selected -march/-mcpu doesn't support +function x264_cpu_neon_test, export=1 + vadd.i16 q0, q0, q0 + bx lr +.endfunc + +// return: 0 on success +// 1 if counters were already enabled +// 9 if lo-res counters were already enabled +function x264_cpu_enable_armv7_counter + mrc p15, 0, r2, c9, c12, 0 // read PMNC + ands r0, r2, #1 + andne r0, r2, #9 + + orr r2, r2, #1 // enable counters + bic r2, r2, #8 // full resolution + mcreq p15, 0, r2, c9, c12, 0 // write PMNC + mov r2, #1 << 31 // enable cycle counter + mcr p15, 0, r2, c9, c12, 1 // write CNTENS + bx lr +.endfunc + +function x264_cpu_disable_armv7_counter + mrc p15, 0, r0, c9, c12, 0 // read PMNC + bic r0, r0, #1 // disable counters + mcr p15, 0, r0, c9, c12, 0 // write PMNC + bx lr +.endfunc + + +.macro READ_TIME r + mrc p15, 0, \r, c9, c13, 0 +.endm + +// return: 0 if transfers neon -> arm transfers take more than 10 cycles +// nonzero otherwise +function x264_cpu_fast_neon_mrc_test, export=1 + // check for user access to performance counters + mrc p15, 0, r0, c9, c14, 0 + cmp r0, #0 + bxeq lr + + push {r4-r6,lr} + bl x264_cpu_enable_armv7_counter + ands r1, r0, #8 + mov r3, #0 + mov ip, #4 + mov r6, #4 + moveq r5, #1 + movne r5, #64 + +average_loop: + mov r4, r5 + READ_TIME r1 +1: subs r4, r4, #1 +.rept 8 + vmov.u32 lr, d0[0] + add lr, lr, lr +.endr + bgt 1b + READ_TIME r2 + + subs r6, r6, #1 + sub r2, r2, r1 + cmpgt r2, #30 << 3 // assume context switch if it took over 30 cycles + addle r3, r3, r2 + subles ip, ip, #1 + bgt average_loop + + // disable counters if we enabled them + ands r0, r0, #1 + bleq x264_cpu_disable_armv7_counter + + lsr r0, r3, #5 + cmp r0, #10 + movgt r0, #0 + pop {r4-r6,pc} +.endfunc diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S new file mode 100644 index 00000000..d9854875 --- /dev/null +++ b/common/arm/pixel-a.S @@ -0,0 +1,1250 @@ +/***************************************************************************** + * pixel.S: h264 encoder + ***************************************************************************** + * Copyright (C) 2009 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + *****************************************************************************/ + +#include "asm.S" + +.fpu neon +.section .rodata +.align 4 + +.rept 16 .byte 0xff +.endr +mask_ff: +.rept 16 .byte 0 +.endr + +mask_ac4: +.short 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac8: +.short 0, -1, -1, -1, -1, -1, -1, -1 + +.text + +.macro SAD4_ARMV6 h +function x264_pixel_sad_4x\h\()_armv6, export=1 + push {r4-r6,lr} + ldr r4, [r2], r3 + ldr r5, [r0], r1 + ldr r6, [r2], r3 + ldr lr, [r0], r1 + usad8 ip, r4, r5 +.rept (\h - 2)/2 + ldr r4, [r2], r3 + ldr r5, [r0], r1 + usada8 ip, r6, lr, ip + ldr r6, [r2], r3 + ldr lr, [r0], r1 + usada8 ip, r4, r5, ip +.endr + usada8 r0, r6, lr, ip + pop {r4-r6,pc} +.endfunc +.endm + +SAD4_ARMV6 4 +SAD4_ARMV6 8 + + +.macro SAD_START_4 align:vararg + vld1.32 {d1[]}, [r2 \align], r3 + vld1.32 {d0[]}, [r0,:32], r1 + vabdl.u8 q8, d0, d1 +.endm + +.macro SAD_4 align:vararg + vld1.32 {d1[]}, [r2 \align], r3 + vld1.32 {d0[]}, [r0,:32], r1 + vabal.u8 q8, d0, d1 +.endm + +.macro SAD_START_8 align:vararg + vld1.64 {d1}, [r2 \align], r3 + vld1.64 {d0}, [r0,:64], r1 + vabdl.u8 q8, d0, d1 +.endm + +.macro SAD_8 align:vararg + vld1.64 {d1}, [r2 \align], r3 + vld1.64 {d0}, [r0,:64], r1 + vabal.u8 q8, d0, d1 +.endm + +.macro SAD_START_16 align:vararg + vld1.64 {d2-d3}, [r2 \align], r3 + vld1.64 {d0-d1}, [r0,:128], r1 + vabdl.u8 q8, d0, d2 + vld1.64 {d6-d7}, [r2 \align], r3 + vabdl.u8 q9, d1, d3 + vld1.64 {d4-d5}, [r0,:128], r1 +.endm + +.macro SAD_16 align:vararg + vabal.u8 q8, d4, d6 + vld1.64 {d2-d3}, [r2 \align], r3 + vabal.u8 q9, d5, d7 + vld1.64 {d0-d1}, [r0,:128], r1 + vabal.u8 q8, d0, d2 + vld1.64 {d6-d7}, [r2 \align], r3 + vabal.u8 q9, d1, d3 + vld1.64 {d4-d5}, [r0,:128], r1 +.endm + +.macro SAD_FUNC w, h, name, align:vararg +function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1 +.if \w == 16 + .set r, \h / 2 - 1 +.else + .set r, \h - 1 +.endif + + SAD_START_\w \align +.rept r + SAD_\w \align +.endr + +.if \w > 8 + vabal.u8 q8, d4, d6 + vabal.u8 q9, d5, d7 + vadd.u16 q8, q8, q9 +.endif +.if \w > 4 + vadd.u16 d16, d16, d17 +.endif + vpadd.u16 d0, d16, d16 + vpaddl.u16 d0, d0 + vmov.u32 r0, d0[0] + bx lr +.endfunc +.endm + +SAD_FUNC 4, 4 +SAD_FUNC 4, 8 +SAD_FUNC 8, 4 +SAD_FUNC 8, 8 +SAD_FUNC 8, 16 +SAD_FUNC 16, 8 +SAD_FUNC 16, 16 + +SAD_FUNC 4, 4, _aligned, ,:32 +SAD_FUNC 4, 8, _aligned, ,:32 +SAD_FUNC 8, 4, _aligned, ,:64 +SAD_FUNC 8, 8, _aligned, ,:64 +SAD_FUNC 8, 16, _aligned, ,:64 +SAD_FUNC 16, 8, _aligned, ,:128 +SAD_FUNC 16, 16, _aligned, ,:128 + +// If dual issue is possible, use additional accumulators to avoid +// stalls from vadal's latency. This only matters for aligned. 
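+// Concretely, SAD_DUAL_8 alternates rows between two accumulators: even rows
+// go through vabal.u8 into q8, odd rows into q9, so back-to-back accumulates
+// never depend on each other; the 16-wide variants spread rows across q8-q11.
+// The partial sums are only folded together at the end of SAD_FUNC_DUAL.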
+.macro SAD_DUAL_START_8 + SAD_START_8 ,:64 + vld1.64 {d3}, [r2,:64], r3 + vld1.64 {d2}, [r0,:64], r1 + vabdl.u8 q9, d2, d3 +.endm + +.macro SAD_DUAL_8 align:vararg + vld1.64 {d1}, [r2,:64], r3 + vld1.64 {d0}, [r0,:64], r1 + vabal.u8 q8, d0, d1 + vld1.64 {d3}, [r2,:64], r3 + vld1.64 {d2}, [r0,:64], r1 + vabal.u8 q9, d2, d3 +.endm + +.macro SAD_DUAL_START_16 + SAD_START_16 ,:128 + vabdl.u8 q10, d4, d6 + vld1.64 {d2-d3}, [r2,:128], r3 + vabdl.u8 q11, d5, d7 + vld1.64 {d0-d1}, [r0,:128], r1 +.endm + +.macro SAD_DUAL_16 + vabal.u8 q8, d0, d2 + vld1.64 {d6-d7}, [r2,:128], r3 + vabal.u8 q9, d1, d3 + vld1.64 {d4-d5}, [r0,:128], r1 + vabal.u8 q10, d4, d6 + vld1.64 {d2-d3}, [r2,:128], r3 + vabal.u8 q11, d5, d7 + vld1.64 {d0-d1}, [r0,:128], r1 +.endm + +.macro SAD_DUAL_END_16 + vabal.u8 q8, d0, d2 + vld1.64 {d6-d7}, [r2,:128], r3 + vabal.u8 q9, d1, d3 + vld1.64 {d4-d5}, [r0,:128], r1 + vabal.u8 q10, d4, d6 + vabal.u8 q11, d5, d7 +.endm + +.macro SAD_FUNC_DUAL w, h +function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual, export=1 +.if \w == 16 + .set r, \h / 2 - 2 +.else + .set r, \h / 2 - 1 +.endif + + SAD_DUAL_START_\w +.rept \h / 2 - \w / 8 + SAD_DUAL_\w +.endr + +.if \w > 8 + SAD_DUAL_END_16 + vadd.u16 q8, q8, q9 + vadd.u16 q9, q10, q11 +.endif +.if \w > 4 + vadd.u16 q8, q8, q9 + vadd.u16 d16, d16, d17 +.endif + vpadd.u16 d0, d16, d16 + vpaddl.u16 d0, d0 + vmov.u32 r0, d0[0] + bx lr +.endfunc +.endm + +SAD_FUNC_DUAL 8, 4 +SAD_FUNC_DUAL 8, 8 +SAD_FUNC_DUAL 8, 16 +SAD_FUNC_DUAL 16, 8 +SAD_FUNC_DUAL 16, 16 + + +.macro SAD_X_START_4 x + vld1.32 {d0[]}, [r0,:32], lr + vld1.32 {d1[]}, [r1], r6 + vabdl.u8 q8, d1, d0 + vld1.32 {d2[]}, [r2], r6 + vabdl.u8 q9, d2, d0 + vld1.32 {d3[]}, [r3], r6 + vabdl.u8 q10, d3, d0 +.if \x == 4 + vld1.32 {d4[]}, [r12], r6 + vabdl.u8 q11, d4, d0 +.endif +.endm + +.macro SAD_X_4 x + vld1.32 {d0[]}, [r0,:32], lr + vld1.32 {d1[]}, [r1], r6 + vabal.u8 q8, d1, d0 + vld1.32 {d2[]}, [r2], r6 + vabal.u8 q9, d2, d0 + vld1.32 {d3[]}, [r3], r6 + vabal.u8 q10, d3, d0 +.if \x == 4 + vld1.32 {d4[]}, [r12], r6 + vabal.u8 q11, d4, d0 +.endif +.endm + +.macro SAD_X_START_8 x + vld1.64 {d0}, [r0,:64], lr + vld1.64 {d1}, [r1], r6 + vabdl.u8 q8, d1, d0 + vld1.64 {d2}, [r2], r6 + vabdl.u8 q9, d2, d0 + vld1.64 {d3}, [r3], r6 + vabdl.u8 q10, d3, d0 +.if \x == 4 + vld1.64 {d4}, [r12], r6 + vabdl.u8 q11, d4, d0 +.endif +.endm + +.macro SAD_X_8 x + vld1.64 {d0}, [r0,:64], lr + vld1.64 {d1}, [r1], r6 + vabal.u8 q8, d1, d0 + vld1.64 {d2}, [r2], r6 + vabal.u8 q9, d2, d0 + vld1.64 {d3}, [r3], r6 + vabal.u8 q10, d3, d0 +.if \x == 4 + vld1.64 {d4}, [r12], r6 + vabal.u8 q11, d4, d0 +.endif +.endm + +.macro SAD_X_START_16 x + vld1.64 {d0-d1}, [r0,:128], lr + vld1.64 {d2-d3}, [r1], r6 + vabdl.u8 q8, d2, d0 + vabdl.u8 q12, d3, d1 + vld1.64 {d4-d5}, [r2], r6 + vabdl.u8 q9, d4, d0 + vabdl.u8 q13, d5, d1 + vld1.64 {d6-d7}, [r3], r6 + vabdl.u8 q10, d6, d0 + vabdl.u8 q14, d7, d1 +.if \x == 4 + vld1.64 {d2-d3}, [r12], r6 + vabdl.u8 q11, d2, d0 + vabdl.u8 q15, d3, d1 +.endif +.endm + +.macro SAD_X_16 x + vld1.64 {d0-d1}, [r0,:128], lr + vld1.64 {d2-d3}, [r1], r6 + vabal.u8 q8, d2, d0 + vabal.u8 q12, d3, d1 + vld1.64 {d4-d5}, [r2], r6 + vabal.u8 q9, d4, d0 + vabal.u8 q13, d5, d1 + vld1.64 {d6-d7}, [r3], r6 + vabal.u8 q10, d6, d0 + vabal.u8 q14, d7, d1 +.if \x == 4 + vld1.64 {d2-d3}, [r12], r6 + vabal.u8 q11, d2, d0 + vabal.u8 q15, d3, d1 +.endif +.endm + +.macro SAD_X_FUNC x, w, h +function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 + push {r6-r7,lr} +.if \x == 3 + ldrd r6, [sp, #12] +.else + ldrd r6, [sp, #16] 
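+    // x4 also takes a fourth reference pointer on the stack; it is loaded
+    // into r12 below. r6/r7 receive the reference stride and the score array.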
+ ldr r12, [sp, #12] +.endif + mov lr, #FENC_STRIDE + + SAD_X_START_\w \x +.rept \h - 1 + SAD_X_\w \x +.endr + +// add up the sads +.if \w > 8 + vadd.u16 q8, q8, q12 + vadd.u16 q9, q9, q13 + vadd.u16 q10, q10, q14 +.if \x == 4 + vadd.u16 q11, q11, q15 +.endif +.endif +.if \w > 4 + vadd.u16 d16, d16, d17 + vadd.u16 d18, d18, d19 + vadd.u16 d20, d20, d21 +.if \x == 4 + vadd.u16 d22, d22, d23 +.endif +.endif + vpadd.u16 d0, d16, d18 + vpadd.u16 d1, d20, d22 + vpaddl.u16 q0, q0 + +.if \x == 3 + vst1.32 {d0}, [r7]! + vst1.32 {d1[0]}, [r7,:32] +.else + vst1.32 {d0-d1}, [r7] +.endif + pop {r6-r7,pc} +.endfunc +.endm + +SAD_X_FUNC 3, 4, 4 +SAD_X_FUNC 3, 4, 8 +SAD_X_FUNC 3, 8, 4 +SAD_X_FUNC 3, 8, 8 +SAD_X_FUNC 3, 8, 16 +SAD_X_FUNC 3, 16, 8 +SAD_X_FUNC 3, 16, 16 + +SAD_X_FUNC 4, 4, 4 +SAD_X_FUNC 4, 4, 8 +SAD_X_FUNC 4, 8, 4 +SAD_X_FUNC 4, 8, 8 +SAD_X_FUNC 4, 8, 16 +SAD_X_FUNC 4, 16, 8 +SAD_X_FUNC 4, 16, 16 + + +.macro SSD_START_4 + vld1.32 {d16[]}, [r0,:32], r1 + vld1.32 {d17[]}, [r2,:32], r3 + vsubl.u8 q2, d16, d17 + vld1.32 {d16[]}, [r0,:32], r1 + vmull.s16 q0, d4, d4 + vld1.32 {d17[]}, [r2,:32], r3 +.endm + +.macro SSD_4 + vsubl.u8 q2, d16, d17 + vld1.32 {d16[]}, [r0,:32], r1 + vmlal.s16 q0, d4, d4 + vld1.32 {d17[]}, [r2,:32], r3 +.endm + +.macro SSD_END_4 + vsubl.u8 q2, d16, d17 + vmlal.s16 q0, d4, d4 +.endm + +.macro SSD_START_8 + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d17}, [r2,:64], r3 + vsubl.u8 q2, d16, d17 + vld1.64 {d16}, [r0,:64], r1 + vmull.s16 q0, d4, d4 + vmlal.s16 q0, d5, d5 + vld1.64 {d17}, [r2,:64], r3 +.endm + +.macro SSD_8 + vsubl.u8 q2, d16, d17 + vld1.64 {d16}, [r0,:64], r1 + vmlal.s16 q0, d4, d4 + vmlal.s16 q0, d5, d5 + vld1.64 {d17}, [r2,:64], r3 +.endm + +.macro SSD_END_8 + vsubl.u8 q2, d16, d17 + vmlal.s16 q0, d4, d4 + vmlal.s16 q0, d5, d5 +.endm + +.macro SSD_START_16 + vld1.64 {d16-d17}, [r0,:128], r1 + vld1.64 {d18-d19}, [r2,:128], r3 + vsubl.u8 q2, d16, d18 + vsubl.u8 q3, d17, d19 + vld1.64 {d16-d17}, [r0,:128], r1 + vmull.s16 q0, d4, d4 + vmlal.s16 q0, d5, d5 + vld1.64 {d18-d19}, [r2,:128], r3 + vmlal.s16 q0, d6, d6 + vmlal.s16 q0, d7, d7 +.endm + +.macro SSD_16 + vsubl.u8 q2, d16, d18 + vsubl.u8 q3, d17, d19 + vld1.64 {d16-d17}, [r0,:128], r1 + vmlal.s16 q0, d4, d4 + vmlal.s16 q0, d5, d5 + vld1.64 {d18-d19}, [r2,:128], r3 + vmlal.s16 q0, d6, d6 + vmlal.s16 q0, d7, d7 +.endm + +.macro SSD_END_16 + vsubl.u8 q2, d16, d18 + vsubl.u8 q3, d17, d19 + vmlal.s16 q0, d4, d4 + vmlal.s16 q0, d5, d5 + vmlal.s16 q0, d6, d6 + vmlal.s16 q0, d7, d7 +.endm + +.macro SSD_FUNC w h +function x264_pixel_ssd_\w\()x\h\()_neon, export=1 + SSD_START_\w +.rept \h-2 + SSD_\w +.endr + SSD_END_\w + vadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + vmov.32 r0, d0[0] + bx lr +.endfunc +.endm + +SSD_FUNC 4, 4 +SSD_FUNC 4, 8 +SSD_FUNC 8, 4 +SSD_FUNC 8, 8 +SSD_FUNC 8, 16 +SSD_FUNC 16, 8 +SSD_FUNC 16, 16 + + +.macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16 + vmull.u8 \qsqr, \dsrc, \dsrc + vaddw.u8 q0, q0, \dsrc + \vpadal \qsqr_sum, \qsqr_last +.endm + +function x264_pixel_var_8x8_neon, export=1 + vld1.64 {d16}, [r0,:64], r1 + vmull.u8 q1, d16, d16 + vmovl.u8 q0, d16 + vld1.64 {d18}, [r0,:64], r1 + vmull.u8 q2, d18, d18 + vaddw.u8 q0, q0, d18 + + vld1.64 {d20}, [r0,:64], r1 + VAR_SQR_SUM q1, q1, q3, d20, vpaddl.u16 + vld1.64 {d22}, [r0,:64], r1 + VAR_SQR_SUM q2, q2, q8, d22, vpaddl.u16 + + vld1.64 {d24}, [r0,:64], r1 + VAR_SQR_SUM q1, q3, q9, d24 + vld1.64 {d26}, [r0,:64], r1 + VAR_SQR_SUM q2, q8, q10, d26 + vld1.64 {d24}, [r0,:64], r1 + VAR_SQR_SUM q1, q9, q14, d24 + vld1.64 {d26}, [r0,:64], 
r1 + VAR_SQR_SUM q2, q10, q15, d26 + + mov r2, #6 + b x264_var_end +.endfunc + +function x264_pixel_var_16x16_neon, export=1 + vld1.64 {d16-d17}, [r0,:128], r1 + vmull.u8 q12, d16, d16 + vmovl.u8 q0, d16 + vmull.u8 q13, d17, d17 + vaddw.u8 q0, q0, d17 + + vld1.64 {d18-d19}, [r0,:128], r1 + VAR_SQR_SUM q1, q12, q14, d18, vpaddl.u16 + VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16 + + mov ip, #7 + mov r2, #8 +var16_loop: + subs ip, ip, #1 + vld1.64 {d16-d17}, [r0,:128], r1 + VAR_SQR_SUM q1, q14, q12, d16 + VAR_SQR_SUM q2, q15, q13, d17 + + vld1.64 {d18-d19}, [r0,:128], r1 + VAR_SQR_SUM q1, q12, q14, d18 + VAR_SQR_SUM q2, q13, q15, d19 + bgt var16_loop +.endfunc + +function x264_var_end + vpaddl.u16 q8, q14 + vpaddl.u16 q9, q15 + vadd.u32 q1, q1, q8 + vadd.u16 d0, d0, d1 + vadd.u32 q1, q1, q9 + vadd.u32 q1, q1, q2 + vpaddl.u16 d0, d0 + vadd.u32 d2, d2, d3 + vpadd.u32 d0, d0, d2 + + vmov r0, r1, d0 + mul r0, r0, r0 + sub r0, r1, r0, lsr r2 + bx lr +.endfunc + +.macro DIFF_SUM diff da db lastdiff + vld1.64 {\da}, [r0,:64], r1 + vld1.64 {\db}, [r2,:64], r3 +.ifnb \lastdiff + vadd.s16 q0, q0, \lastdiff +.endif + vsubl.u8 \diff, \da, \db +.endm + +.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16 + \vmlal \acc, \d0, \d0 + vmlal.s16 \acc, \d1, \d1 +.endm + +function x264_pixel_var2_8x8_neon, export=1 + DIFF_SUM q0, d0, d1 + DIFF_SUM q8, d16, d17 + SQR_ACC q1, d0, d1, vmull.s16 + DIFF_SUM q9, d18, d19, q8 + SQR_ACC q2, d16, d17, vmull.s16 +.rept 2 + DIFF_SUM q8, d16, d17, q9 + SQR_ACC q1, d18, d19 + DIFF_SUM q9, d18, d19, q8 + SQR_ACC q2, d16, d17 +.endr + DIFF_SUM q8, d16, d17, q9 + SQR_ACC q1, d18, d19 + vadd.s16 q0, q0, q8 + SQR_ACC q2, d16, d17 + + ldr ip, [sp] + vadd.s16 d0, d0, d1 + vadd.s32 q1, q1, q2 + vpaddl.s16 d0, d0 + vadd.s32 d1, d2, d3 + vpadd.s32 d0, d0, d1 + + vmov.32 r0, r1, d0 + vst1.32 {d0[1]}, [ip,:32] + mul r0, r0, r0 + sub r0, r1, r0, lsr #6 + bx lr +.endfunc + + +.macro LOAD_DIFF_8x4 q0 q1 q2 q3 + vld1.32 {d1}, [r2], r3 + vld1.32 {d0}, [r0,:64], r1 + vsubl.u8 \q0, d0, d1 + vld1.32 {d3}, [r2], r3 + vld1.32 {d2}, [r0,:64], r1 + vsubl.u8 \q1, d2, d3 + vld1.32 {d5}, [r2], r3 + vld1.32 {d4}, [r0,:64], r1 + vsubl.u8 \q2, d4, d5 + vld1.32 {d7}, [r2], r3 + vld1.32 {d6}, [r0,:64], r1 + vsubl.u8 \q3, d6, d7 +.endm + +function x264_pixel_satd_4x4_neon, export=1 + vld1.32 {d1[]}, [r2], r3 + vld1.32 {d0[]}, [r0,:32], r1 + vld1.32 {d3[]}, [r2], r3 + vld1.32 {d2[]}, [r0,:32], r1 + vld1.32 {d1[1]}, [r2], r3 + vld1.32 {d0[1]}, [r0,:32], r1 + vld1.32 {d3[1]}, [r2], r3 + vld1.32 {d2[1]}, [r0,:32], r1 + vsubl.u8 q0, d0, d1 + vsubl.u8 q1, d2, d3 + + SUMSUB_AB q2, q3, q0, q1 + SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7 + HADAMARD 1, sumsub, q2, q3, q0, q1 + HADAMARD 2, amax, q0,, q2, q3 + + HORIZ_ADD d0, d0, d1 + vmov.32 r0, d0[0] + bx lr +.endfunc + +function x264_pixel_satd_4x8_neon, export=1 + vld1.32 {d1[]}, [r2], r3 + vld1.32 {d0[]}, [r0,:32], r1 + vld1.32 {d3[]}, [r2], r3 + vld1.32 {d2[]}, [r0,:32], r1 + vld1.32 {d5[]}, [r2], r3 + vld1.32 {d4[]}, [r0,:32], r1 + vld1.32 {d7[]}, [r2], r3 + vld1.32 {d6[]}, [r0,:32], r1 + + vld1.32 {d1[1]}, [r2], r3 + vld1.32 {d0[1]}, [r0,:32], r1 + vsubl.u8 q0, d0, d1 + vld1.32 {d3[1]}, [r2], r3 + vld1.32 {d2[1]}, [r0,:32], r1 + vsubl.u8 q1, d2, d3 + vld1.32 {d5[1]}, [r2], r3 + vld1.32 {d4[1]}, [r0,:32], r1 + vsubl.u8 q2, d4, d5 + vld1.32 {d7[1]}, [r2], r3 + SUMSUB_AB q8, q9, q0, q1 + vld1.32 {d6[1]}, [r0,:32], r1 + vsubl.u8 q3, d6, d7 + SUMSUB_AB q10, q11, q2, q3 + b x264_satd_4x8_8x4_end_neon +.endfunc + +function x264_pixel_satd_8x4_neon, export=1 + vld1.64 {d1}, [r2], r3 
+ vld1.64 {d0}, [r0,:64], r1 + vsubl.u8 q0, d0, d1 + vld1.64 {d3}, [r2], r3 + vld1.64 {d2}, [r0,:64], r1 + vsubl.u8 q1, d2, d3 + vld1.64 {d5}, [r2], r3 + vld1.64 {d4}, [r0,:64], r1 + vsubl.u8 q2, d4, d5 + vld1.64 {d7}, [r2], r3 + SUMSUB_AB q8, q9, q0, q1 + vld1.64 {d6}, [r0,:64], r1 + vsubl.u8 q3, d6, d7 + SUMSUB_AB q10, q11, q2, q3 +.endfunc + +function x264_satd_4x8_8x4_end_neon + vadd.s16 q0, q8, q10 + vadd.s16 q1, q9, q11 + vsub.s16 q2, q8, q10 + vsub.s16 q3, q9, q11 + + vtrn.16 q0, q1 + vadd.s16 q8, q0, q1 + vtrn.16 q2, q3 + vsub.s16 q9, q0, q1 + vadd.s16 q10, q2, q3 + vsub.s16 q11, q2, q3 + vtrn.32 q8, q10 + vabs.s16 q8, q8 + vtrn.32 q9, q11 + vabs.s16 q10, q10 + vabs.s16 q9, q9 + vabs.s16 q11, q11 + vmax.u16 q0, q8, q10 + vmax.u16 q1, q9, q11 + + vadd.u16 q0, q0, q1 + HORIZ_ADD d0, d0, d1 + vmov.32 r0, d0[0] + bx lr +.endfunc + +function x264_pixel_satd_8x8_neon, export=1 + mov ip, lr + + bl x264_satd_8x8_neon + vadd.u16 q0, q12, q13 + vadd.u16 q1, q14, q15 + + vadd.u16 q0, q0, q1 + HORIZ_ADD d0, d0, d1 + mov lr, ip + vmov.32 r0, d0[0] + bx lr +.endfunc + +function x264_pixel_satd_8x16_neon, export=1 + vpush {d8-d11} + mov ip, lr + + bl x264_satd_8x8_neon + vadd.u16 q4, q12, q13 + vadd.u16 q5, q14, q15 + + bl x264_satd_8x8_neon + vadd.u16 q4, q4, q12 + vadd.u16 q5, q5, q13 + vadd.u16 q4, q4, q14 + vadd.u16 q5, q5, q15 + + vadd.u16 q0, q4, q5 + HORIZ_ADD d0, d0, d1 + vpop {d8-d11} + mov lr, ip + vmov.32 r0, d0[0] + bx lr +.endfunc + +function x264_satd_8x8_neon + LOAD_DIFF_8x4 q8, q9, q10, q11 + vld1.64 {d7}, [r2], r3 + SUMSUB_AB q0, q1, q8, q9 + vld1.64 {d6}, [r0,:64], r1 + vsubl.u8 q12, d6, d7 + vld1.64 {d17}, [r2], r3 + SUMSUB_AB q2, q3, q10, q11 + vld1.64 {d16}, [r0,:64], r1 + vsubl.u8 q13, d16, d17 + vld1.64 {d19}, [r2], r3 + SUMSUB_AB q8, q10, q0, q2 + vld1.64 {d18}, [r0,:64], r1 + vsubl.u8 q14, d18, d19 + vld1.64 {d1}, [r2], r3 + SUMSUB_AB q9, q11, q1, q3 + vld1.64 {d0}, [r0,:64], r1 + vsubl.u8 q15, d0, d1 +.endfunc + +// one vertical hadamard pass and two horizontal +function x264_satd_8x4v_8x8h_neon + SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15 + vtrn.16 q8, q9 + SUMSUB_AB q12, q14, q0, q2 + vtrn.16 q10, q11 + SUMSUB_AB q13, q15, q1, q3 + SUMSUB_AB q0, q1, q8, q9 + vtrn.16 q12, q13 + SUMSUB_AB q2, q3, q10, q11 + vtrn.16 q14, q15 + SUMSUB_AB q8, q9, q12, q13 + vtrn.32 q0, q2 + SUMSUB_AB q10, q11, q14, q15 + + vtrn.32 q1, q3 + ABS2 q0, q2 + vtrn.32 q8, q10 + ABS2 q1, q3 + vtrn.32 q9, q11 + ABS2 q8, q10 + ABS2 q9, q11 + vmax.s16 q12, q0, q2 + vmax.s16 q13, q1, q3 + vmax.s16 q14, q8, q10 + vmax.s16 q15, q9, q11 + bx lr +.endfunc + +function x264_pixel_satd_16x8_neon, export=1 + vpush {d8-d11} + mov ip, lr + + bl x264_satd_16x4_neon + vadd.u16 q4, q12, q13 + vadd.u16 q5, q14, q15 + + bl x264_satd_16x4_neon + vadd.u16 q4, q4, q12 + vadd.u16 q5, q5, q13 + vadd.u16 q4, q4, q14 + vadd.u16 q5, q5, q15 + + vadd.u16 q0, q4, q5 + HORIZ_ADD d0, d0, d1 + vpop {d8-d11} + mov lr, ip + vmov.32 r0, d0[0] + bx lr +.endfunc + +function x264_pixel_satd_16x16_neon, export=1 + vpush {d8-d11} + mov ip, lr + + bl x264_satd_16x4_neon + vadd.u16 q4, q12, q13 + vadd.u16 q5, q14, q15 + + bl x264_satd_16x4_neon + vadd.u16 q4, q4, q12 + vadd.u16 q5, q5, q13 + vadd.u16 q4, q4, q14 + vadd.u16 q5, q5, q15 + + bl x264_satd_16x4_neon + vadd.u16 q4, q4, q12 + vadd.u16 q5, q5, q13 + vadd.u16 q4, q4, q14 + vadd.u16 q5, q5, q15 + + bl x264_satd_16x4_neon + vadd.u16 q4, q4, q12 + vadd.u16 q5, q5, q13 + vadd.u16 q4, q4, q14 + vadd.u16 q5, q5, q15 + + vadd.u16 q0, q4, q5 + HORIZ_ADD d0, d0, d1 + vpop {d8-d11} + mov 
lr, ip + vmov.32 r0, d0[0] + bx lr +.endfunc + +function x264_satd_16x4_neon + vld1.64 {d2-d3}, [r2], r3 + vld1.64 {d0-d1}, [r0,:128], r1 + vsubl.u8 q8, d0, d2 + vld1.64 {d6-d7}, [r2], r3 + vsubl.u8 q12, d1, d3 + vld1.64 {d4-d5}, [r0,:128], r1 + vsubl.u8 q9, d4, d6 + vld1.64 {d2-d3}, [r2], r3 + vsubl.u8 q13, d5, d7 + vld1.64 {d0-d1}, [r0,:128], r1 + vsubl.u8 q10, d0, d2 + vld1.64 {d6-d7}, [r2], r3 + vsubl.u8 q14, d1, d3 + vadd.s16 q0, q8, q9 + vld1.64 {d4-d5}, [r0,:128], r1 + vsub.s16 q1, q8, q9 + vsubl.u8 q11, d4, d6 + vsubl.u8 q15, d5, d7 + SUMSUB_AB q2, q3, q10, q11 + SUMSUB_ABCD q8, q10, q9, q11, q0, q2, q1, q3 + b x264_satd_8x4v_8x8h_neon +.endfunc + + +function x264_pixel_sa8d_8x8_neon, export=1 + mov ip, lr + bl x264_sa8d_8x8_neon + vadd.u16 q0, q8, q9 + HORIZ_ADD d0, d0, d1 + mov lr, ip + vmov.32 r0, d0[0] + add r0, r0, #1 + lsr r0, r0, #1 + bx lr +.endfunc + +function x264_pixel_sa8d_16x16_neon, export=1 + vpush {d8-d11} + mov ip, lr + bl x264_sa8d_8x8_neon + vpaddl.u16 q4, q8 + vpaddl.u16 q5, q9 + bl x264_sa8d_8x8_neon + vpadal.u16 q4, q8 + vpadal.u16 q5, q9 + sub r0, r0, r1, lsl #4 + sub r2, r2, r3, lsl #4 + add r0, r0, #8 + add r2, r2, #8 + bl x264_sa8d_8x8_neon + vpadal.u16 q4, q8 + vpadal.u16 q5, q9 + bl x264_sa8d_8x8_neon + vpaddl.u16 q8, q8 + vpaddl.u16 q9, q9 + vadd.u32 q0, q4, q8 + vadd.u32 q1, q5, q9 + vadd.u32 q0, q0, q1 + vadd.u32 d0, d0, d1 + vpadd.u32 d0, d0, d0 + vpop {d8-d11} + mov lr, ip + vmov.32 r0, d0[0] + add r0, r0, #1 + lsr r0, r0, #1 + bx lr +.endfunc + +.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 + SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 + SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 +.endm + +function x264_sa8d_8x8_neon + LOAD_DIFF_8x4 q8, q9, q10, q11 + vld1.64 {d7}, [r2], r3 + SUMSUB_AB q0, q1, q8, q9 + vld1.64 {d6}, [r0,:64], r1 + vsubl.u8 q12, d6, d7 + vld1.64 {d17}, [r2], r3 + SUMSUB_AB q2, q3, q10, q11 + vld1.64 {d16}, [r0,:64], r1 + vsubl.u8 q13, d16, d17 + vld1.64 {d19}, [r2], r3 + SUMSUB_AB q8, q10, q0, q2 + vld1.64 {d18}, [r0,:64], r1 + vsubl.u8 q14, d18, d19 + vld1.64 {d1}, [r2], r3 + SUMSUB_AB q9, q11, q1, q3 + vld1.64 {d0}, [r0,:64], r1 + vsubl.u8 q15, d0, d1 + + HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3 + SUMSUB_ABCD q0, q8, q1, q9, q8, q12, q9, q13 + SUMSUB_AB q2, q10, q10, q14 + vtrn.16 q8, q9 + SUMSUB_AB q3, q11, q11, q15 + vtrn.16 q0, q1 + SUMSUB_AB q12, q13, q8, q9 + vtrn.16 q10, q11 + SUMSUB_AB q8, q9, q0, q1 + vtrn.16 q2, q3 + SUMSUB_AB q14, q15, q10, q11 + vadd.i16 q10, q2, q3 + vtrn.32 q12, q14 + vsub.i16 q11, q2, q3 + vtrn.32 q13, q15 + SUMSUB_AB q0, q2, q12, q14 + vtrn.32 q8, q10 + SUMSUB_AB q1, q3, q13, q15 + vtrn.32 q9, q11 + SUMSUB_AB q12, q14, q8, q10 + SUMSUB_AB q13, q15, q9, q11 + + vswp d1, d24 + ABS2 q0, q12 + vswp d3, d26 + ABS2 q1, q13 + vswp d5, d28 + ABS2 q2, q14 + vswp d7, d30 + ABS2 q3, q15 + vmax.s16 q8, q0, q12 + vmax.s16 q9, q1, q13 + vmax.s16 q10, q2, q14 + vmax.s16 q11, q3, q15 + vadd.i16 q8, q8, q9 + vadd.i16 q9, q10, q11 + bx lr +.endfunc + + +.macro HADAMARD_AC w h +function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1 + vpush {d8-d15} + movrel ip, mask_ac4 + vmov.i8 q4, #0 + // note: this assumes mask_ac8 is after mask_ac4 (so don't move it) + vld1.64 {d12-d15}, [ip,:128] + vmov.i8 q5, #0 + + mov ip, lr + bl x264_hadamard_ac_8x8_neon +.if \h > 8 + bl x264_hadamard_ac_8x8_neon +.endif +.if \w > 8 + sub r0, r0, r1, lsl #3 + add r0, r0, #8 + bl x264_hadamard_ac_8x8_neon +.endif +.if \w * \h == 256 + sub r0, r0, r1, lsl #4 + bl x264_hadamard_ac_8x8_neon +.endif + + vadd.s32 d8, 
d8, d9 + vadd.s32 d10, d10, d11 + vpadd.s32 d0, d8, d10 + vpop {d8-d15} + mov lr, ip + vmov r0, r1, d0 + lsr r0, r0, #1 + lsr r1, r1, #2 + bx lr +.endfunc +.endm + +HADAMARD_AC 8, 8 +HADAMARD_AC 8, 16 +HADAMARD_AC 16, 8 +HADAMARD_AC 16, 16 + +// q4: satd q5: sa8d q6: mask_ac4 q7: mask_ac8 +function x264_hadamard_ac_8x8_neon + vld1.64 {d2}, [r0,:64], r1 + vld1.64 {d3}, [r0,:64], r1 + vaddl.u8 q0, d2, d3 + vld1.64 {d6}, [r0,:64], r1 + vsubl.u8 q1, d2, d3 + vld1.64 {d7}, [r0,:64], r1 + vaddl.u8 q2, d6, d7 + vld1.64 {d18}, [r0,:64], r1 + vsubl.u8 q3, d6, d7 + vld1.64 {d19}, [r0,:64], r1 + vaddl.u8 q8, d18, d19 + vld1.64 {d22}, [r0,:64], r1 + vsubl.u8 q9, d18, d19 + vld1.64 {d23}, [r0,:64], r1 + + SUMSUB_ABCD q12, q14, q13, q15, q0, q2, q1, q3 + vaddl.u8 q10, d22, d23 + vsubl.u8 q11, d22, d23 + vtrn.16 q12, q13 + SUMSUB_ABCD q0, q2, q1, q3, q8, q10, q9, q11 + + vtrn.16 q14, q15 + SUMSUB_AB q8, q9, q12, q13 + vtrn.16 q0, q1 + SUMSUB_AB q10, q11, q14, q15 + vtrn.16 q2, q3 + SUMSUB_AB q12, q13, q0, q1 + vtrn.32 q8, q10 + SUMSUB_AB q14, q15, q2, q3 + vtrn.32 q9, q11 + SUMSUB_AB q0, q2, q8, q10 + vtrn.32 q12, q14 + SUMSUB_AB q1, q3, q9, q11 + vtrn.32 q13, q15 + SUMSUB_ABCD q8, q10, q9, q11, q12, q14, q13, q15 + + vabs.s16 q12, q0 + vabs.s16 q13, q8 + vabs.s16 q15, q1 + vadd.s16 q12, q12, q13 + vabs.s16 q14, q2 + vand.s16 q12, q12, q6 + vabs.s16 q13, q3 + vadd.s16 q12, q12, q15 + vabs.s16 q15, q9 + vadd.s16 q12, q12, q14 + vabs.s16 q14, q10 + vadd.s16 q12, q12, q13 + vabs.s16 q13, q11 + vadd.s16 q12, q12, q15 + vsub.s16 q15, q11, q3 + vadd.s16 q12, q12, q14 + vadd.s16 q14, q11, q3 + vadd.s16 q12, q12, q13 + vsub.s16 q13, q10, q2 + vadd.s16 q2, q10, q2 + vpadal.u16 q4, q12 + + SUMSUB_AB q10, q11, q9, q1 + SUMSUB_AB q9, q8, q0, q8 + vswp d29, d30 + vabs.s16 q14, q14 + vabs.s16 q15, q15 + vswp d5, d26 + vabs.s16 q2, q2 + vabs.s16 q13, q13 + vswp d21, d22 + vabs.s16 q10, q10 + vabs.s16 q11, q11 + vmax.s16 q3, q14, q15 + vmax.s16 q2, q2, q13 + vmax.s16 q1, q10, q11 + vswp d19, d16 + SUMSUB_AB q14, q15, q9, q8 + + vadd.s16 q2, q2, q3 + vadd.s16 q2, q2, q1 + vand q14, q14, q7 + vadd.s16 q2, q2, q2 + vabs.s16 q15, q15 + vabs.s16 q14, q14 + vadd.s16 q2, q2, q15 + vadd.s16 q2, q2, q14 + vpadal.u16 q5, q2 + bx lr +.endfunc + + +.macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext + vld1.64 {\db}, [r2], r3 + vmull.u8 \ssa, \da, \da + vmull.u8 \s12, \da, \db +.if \n == 1 + vpaddl.u16 q2, \lastssa + vpaddl.u16 q3, \lasts12 + vaddl.u8 q0, d0, \da +.else + vpadal.u16 q2, \lastssa + vpadal.u16 q3, \lasts12 + vaddw.u8 q0, q0, \da +.endif + vpadal.u16 q2, \lastssb +.if \n < 3 + vld1.64 {\dnext}, [r0], r1 +.endif +.if \n == 1 + vaddl.u8 q1, d2, \db +.else + vaddw.u8 q1, q1, \db +.endif + vmull.u8 \ssb, \db, \db +.endm + +function x264_pixel_ssim_4x4x2_core_neon, export=1 + ldr ip, [sp] + vld1.64 {d0}, [r0], r1 + vld1.64 {d2}, [r2], r3 + vmull.u8 q2, d0, d0 + vmull.u8 q3, d0, d2 + vld1.64 {d28}, [r0], r1 + vmull.u8 q15, d2, d2 + + SSIM_ITER 1, q8, q9, q14, q2, q3, q15, d28, d29, d26 + SSIM_ITER 2, q10,q11,q13, q8, q9, q14, d26, d27, d28 + SSIM_ITER 3, q8, q9, q15, q10,q11,q13, d28, d29 + + vpadal.u16 q2, q8 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + vpadal.u16 q2, q15 + vpadal.u16 q3, q9 + + vpadd.u32 d0, d0, d1 + vpadd.u32 d1, d2, d3 + vpadd.u32 d2, d4, d5 + vpadd.u32 d3, d6, d7 + + vst4.32 {d0-d3}, [ip] + bx lr +.endfunc + +// FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2 +function x264_pixel_ssim_end4_neon, export=1 + vld1.32 {d16-d19}, [r0,:128]! + vld1.32 {d20-d23}, [r1,:128]! 
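+    // each q register now holds one 4x4 block's {s1, s2, ss, s12} as stored by
+    // ssim_4x4x2_core; vertically adjacent blocks (the r0 and r1 rows) are added
+    // first, then horizontal neighbours, giving the sums for four 8x8 windows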
+ vadd.s32 q0, q8, q10 + vadd.s32 q1, q9, q11 + vld1.32 {d24-d27}, [r0,:128]! + vadd.s32 q0, q0, q1 + vld1.32 {d28-d31}, [r1,:128]! + vadd.s32 q2, q12, q14 + vadd.s32 q3, q13, q15 + vld1.32 {d16-d17}, [r0,:128] + vadd.s32 q1, q1, q2 + vld1.32 {d18-d19}, [r1,:128] + vadd.s32 q8, q8, q9 + vadd.s32 q2, q2, q3 + vadd.s32 q3, q3, q8 + + vtrn.32 q0, q1 + vtrn.32 q2, q3 + vswp d1, d4 + vswp d3, d6 + +// s1=q0, s2=q1, ss=q2, s12=q3 + vmul.s32 q8, q0, q1 // s1*s2 + vmul.s32 q0, q0, q0 + vmla.s32 q0, q1, q1 // s1*s1 + s2*s2 + + vshl.s32 q3, q3, #7 + vshl.s32 q2, q2, #6 + vadd.s32 q1, q8, q8 + + mov r3, #416 // ssim_c1= .01*.01*255*255*64 + movw ip, #39355 // ssim_c2= .03*.03*255*255*64*63 - 3<<16 + movt ip, #3 + vdup.32 q14, r3 + vdup.32 q15, ip + + vsub.s32 q2, q2, q0 // vars + vsub.s32 q3, q3, q1 // covar*2 + vadd.s32 q0, q0, q14 + vadd.s32 q2, q2, q15 + vadd.s32 q1, q1, q14 + vadd.s32 q3, q3, q15 + + vcvt.f32.s32 q0, q0 + vcvt.f32.s32 q2, q2 + vcvt.f32.s32 q1, q1 + vcvt.f32.s32 q3, q3 + + vmul.f32 q0, q0, q2 + vmul.f32 q1, q1, q3 + + cmp r2, #4 + + vdiv.f32 s0, s4, s0 + vdiv.f32 s1, s5, s1 + vdiv.f32 s2, s6, s2 + vdiv.f32 s3, s7, s3 + + beq ssim_skip + movrel r3, mask_ff + sub r3, r3, r2, lsl #2 + vld1.64 {d6-d7}, [r3] + vand q0, q0, q3 +ssim_skip: + vadd.f32 d0, d0, d1 + vpadd.f32 d0, d0, d0 + vmov.32 r0, d0[0] + bx lr +.endfunc diff --git a/common/arm/pixel.h b/common/arm/pixel.h new file mode 100644 index 00000000..2ef5cea4 --- /dev/null +++ b/common/arm/pixel.h @@ -0,0 +1,69 @@ +/***************************************************************************** + * pixel.h: h264 encoder library + ***************************************************************************** + * Copyright (C) 2009 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#ifndef X264_ARM_PIXEL_H +#define X264_ARM_PIXEL_H + +#define DECL_PIXELS( ret, name, suffix, args ) \ + ret x264_pixel_##name##_16x16_##suffix args;\ + ret x264_pixel_##name##_16x8_##suffix args;\ + ret x264_pixel_##name##_8x16_##suffix args;\ + ret x264_pixel_##name##_8x8_##suffix args;\ + ret x264_pixel_##name##_8x4_##suffix args;\ + ret x264_pixel_##name##_4x8_##suffix args;\ + ret x264_pixel_##name##_4x4_##suffix args;\ + +#define DECL_X1( name, suffix ) \ + DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) ) + +#define DECL_X4( name, suffix ) \ + DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\ + DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) ) + +int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int ); + +DECL_X1( sad, neon ) +DECL_X1( sad_aligned, neon ) +DECL_X1( sad_aligned, neon_dual ) +DECL_X4( sad, neon ) +DECL_X1( satd, neon ) +DECL_X1( ssd, neon ) + +int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int ); + +int x264_pixel_var_8x8_neon( uint8_t *, int ); +int x264_pixel_var_16x16_neon( uint8_t *, int ); +int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * ); + +uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int ); +uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int ); +uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int ); +uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int ); + +void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int, + const uint8_t *, int, + int sums[2][4]); +float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); + +#endif diff --git a/common/pixel.c b/common/pixel.c index 852748ec..292cdf57 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -29,6 +29,9 @@ #ifdef ARCH_PPC # include "ppc/pixel.h" #endif +#ifdef ARCH_ARM +# include "arm/pixel.h" +#endif #ifdef ARCH_UltraSparc # include "sparc/pixel.h" #endif @@ -453,6 +456,10 @@ SATD_X_DECL7( _ssse3 ) SATD_X_DECL7( _sse4 ) #endif +#ifdef HAVE_ARMV6 +SATD_X_DECL7( _neon ) +#endif + /**************************************************************************** * structural similarity metric ****************************************************************************/ @@ -815,6 +822,47 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } #endif //HAVE_MMX +#ifdef HAVE_ARMV6 + if( cpu&X264_CPU_ARMV6 ) + { + pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6; + pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6; + pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6; + pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6; + } + if( cpu&X264_CPU_NEON ) + { + INIT5( sad, _neon ); + INIT5( sad_aligned, _neon ); + INIT7( sad_x3, _neon ); + INIT7( sad_x4, _neon ); + INIT7( ssd, _neon ); + INIT7( satd, _neon ); + INIT7( satd_x3, _neon ); + INIT7( satd_x4, _neon ); + INIT4( hadamard_ac, _neon ); + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; + pixf->var2_8x8 = x264_pixel_var2_8x8_neon; + + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; + pixf->ssim_end4 = x264_pixel_ssim_end4_neon; + + if( 
cpu&X264_CPU_FAST_NEON_MRC ) + { + pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon; + pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon; + pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon; + pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon; + } + else // really just scheduled for dual issue / A8 + { + INIT5( sad_aligned, _neon_dual ); + } + } +#endif #ifdef ARCH_PPC if( cpu&X264_CPU_ALTIVEC ) {
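
For context, the table that x264_pixel_init() fills above is what the rest of the encoder calls through. Below is a minimal usage sketch: the helper name, buffers, strides and the include are illustrative assumptions, while x264_pixel_init(), x264_pixel_function_t, pixf.sad[PIXEL_16x16] and FENC_STRIDE are names from the tree.

    #include "common/common.h"  /* assumed include for x264_pixel_function_t,
                                   x264_pixel_init, PIXEL_16x16, FENC_STRIDE */

    /* Hypothetical helper: cost one 16x16 block through the dispatch table.
     * With X264_CPU_NEON (and optionally X264_CPU_FAST_NEON_MRC) set in cpu,
     * the call below lands in the NEON kernels added by this patch. */
    static int sad_16x16_cost( int cpu, uint8_t *fenc, uint8_t *fref, int i_ref_stride )
    {
        x264_pixel_function_t pixf;
        x264_pixel_init( cpu, &pixf );
        /* every SAD/SSD/SATD slot shares the ( pix1, stride1, pix2, stride2 ) signature */
        return pixf.sad[PIXEL_16x16]( fenc, FENC_STRIDE, fref, i_ref_stride );
    }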