From 4ea9cf3e2da3b6026adc17320e3b550637aeec3c Mon Sep 17 00:00:00 2001 From: James Yu Date: Wed, 18 Dec 2013 18:47:48 +0800 Subject: [PATCH] VP8 for ARMv8 by using NEON intrinsics 16 Add variance_neon.c - vp8_variance16x16_neon - vp8_variance16x8_neon - vp8_variance8x16_neon - vp8_variance8x8_neon Change-Id: Idfb9c96134a1c6a696a98ce68b4f7ed593a00660 Signed-off-by: James Yu --- vp8/common/arm/neon/variance_neon.asm | 291 ----------------------- vp8/common/arm/neon/variance_neon.c | 319 ++++++++++++++++++++++++++ vp8/vp8_common.mk | 3 +- 3 files changed, 320 insertions(+), 293 deletions(-) delete mode 100644 vp8/common/arm/neon/variance_neon.asm create mode 100644 vp8/common/arm/neon/variance_neon.c diff --git a/vp8/common/arm/neon/variance_neon.asm b/vp8/common/arm/neon/variance_neon.asm deleted file mode 100644 index 8ecad72b9..000000000 --- a/vp8/common/arm/neon/variance_neon.asm +++ /dev/null @@ -1,291 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance16x16_neon| - EXPORT |vp8_variance16x8_neon| - EXPORT |vp8_variance8x16_neon| - EXPORT |vp8_variance8x8_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance16x16_neon| PROC - vpush {q5} - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #8 - -variance16x16_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - ;VPADAL adds adjacent pairs of elements of a vector, and accumulates - ;the results into the elements of the destination vector. The explanation - ;in ARM guide is wrong. - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance16x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp, #16] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - ;vmov.32 r0, d0[0] ;this instruction costs a lot - ;vmov.32 r1, d1[0] - ;mul r0, r0, r0 - ;str r1, [r12] - ;sub r0, r1, r0, lsr #8 - - ; while sum is signed, sum * sum is always positive and must be treated as - ; unsigned to avoid propagating the sign bit. 
- vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - - vpop {q5} - bx lr - - ENDP - -;================================ -;unsigned int vp8_variance16x8_c( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *sse) -|vp8_variance16x8_neon| PROC - vpush {q5} - - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #4 - -variance16x8_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance16x8_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp, #16] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #7 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - - vpop {q5} - bx lr - - ENDP - -;================================= -;unsigned int vp8_variance8x16_c( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *sse) - -|vp8_variance8x16_neon| PROC - vpush {q5} - - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #8 - -variance8x16_neon_loop - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d2, d6 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - - bne variance8x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp, #16] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #7 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - - vpop {q5} - bx lr - - ENDP - -;================================== -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance8x8_neon| PROC - vpush {q5} - - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #2 - -variance8x8_neon_loop - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d1}, [r0], r1 - vld1.8 {d5}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - vld1.8 {d3}, [r0], r1 - vld1.8 {d7}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 
q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp, #16]              ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.u32        d10, d10, #6
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {q5}
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c
new file mode 100644
index 000000000..3e25f7ddf
--- /dev/null
+++ b/vp8/common/arm/neon/variance_neon.c
@@ -0,0 +1,319 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+unsigned int vp8_variance16x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance16x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance8x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d2u8, d4u8, d6u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint16x8_t q11u16, q12u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
+        d0u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d2u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        d4u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d6u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(d0u8, d4u8);
+        q12u16 = vsubl_u8(d2u8, d6u8);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance8x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
+        d0u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d1u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d2u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d3u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+
+        d4u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d5u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d6u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d7u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        q11u16 = vsubl_u8(d0u8, d4u8);
+        q12u16 = vsubl_u8(d1u8, d5u8);
+        q13u16 = vsubl_u8(d2u8, d6u8);
+        q14u16 = vsubl_u8(d3u8, d7u8);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32
= vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index ea6226b2e..52e35f28a 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -161,7 +161,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ # common (neon) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) @@ -182,6 +181,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c - +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) -- 2.40.0
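
Note on the arithmetic (not part of the patch): both the removed assembly and the new intrinsics compute variance as sse - sum*sum/N. Register q8 (q8s32) accumulates the signed per-pixel difference sum via vpadal.s16 / vpadalq_s16, q9 and q10 (q9s32/q10s32) accumulate the sum of squared differences via vmlal.s16 / vmlal_s16, sum*sum is widened to 64 bits with vmull.s32 / vmull_s32, and the final right shift by 8, 7, 7 or 6 divides by N = 256, 128, 128 or 64 for the 16x16, 16x8, 8x16 and 8x8 blocks. The scalar sketch below mirrors that computation and can be used to cross-check an intrinsics port against reference output; it is illustrative only, and variance_ref_c is a made-up name for this sketch, not a libvpx function.

/*
 * Scalar cross-check for the NEON variance functions above.
 * Illustrative only; not part of this patch or of libvpx.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static unsigned int variance_ref_c(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
    int i, j, diff, sum = 0;
    unsigned int sq = 0;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            diff = src[j] - ref[j];
            sum += diff;                        /* what q8 accumulates      */
            sq += (unsigned int)(diff * diff);  /* what q9/q10 accumulate   */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sq;
    /* sum*sum is widened to 64 bits, as vmull.s32/vmull_s32 do above;
     * dividing by w*h matches the >>8 / >>7 / >>6 in the NEON code. */
    return sq - (unsigned int)(((int64_t)sum * sum) / (w * h));
}

int main(void) {
    unsigned char src[16 * 16], ref[16 * 16];
    unsigned int sse, var;
    int i;

    for (i = 0; i < 16 * 16; i++) {
        src[i] = (unsigned char)(rand() & 0xff);
        ref[i] = (unsigned char)(rand() & 0xff);
    }
    var = variance_ref_c(src, 16, ref, 16, 16, 16, &sse);
    printf("variance = %u, sse = %u\n", var, sse);
    /* On an ARM build, the same buffers can be passed to
     * vp8_variance16x16_neon() and the two results compared. */
    return 0;
}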