From 71bcd9f1af91f2dea22f2e71839039d83e5b1d84 Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Wed, 9 Mar 2011 14:26:24 +0200 Subject: [PATCH] Add vp8_variance8x8_armv6 and vp8_sub_pixel_variance8x8_armv6 functions Change-Id: I08edaffc62514907fa5e90e1689269e467c857f5 --- vp8/encoder/arm/arm_csystemdependent.c | 12 +-- .../arm/armv6/vp8_variance8x8_armv6.asm | 95 +++++++++++++++++++ vp8/encoder/arm/variance_arm.c | 28 ++++++ vp8/encoder/arm/variance_arm.h | 8 ++ vp8/vp8cx_arm.mk | 1 + 5 files changed, 138 insertions(+), 6 deletions(-) create mode 100644 vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index 5ba14f375..a661a89a4 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -35,15 +35,15 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c; cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;*/ - /*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; - cpi->rtcd.variance.var8x8 = vp8_variance8x8_c; - cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; + /*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;*/ + cpi->rtcd.variance.var8x8 = vp8_variance8x8_armv6; + /*cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;*/ cpi->rtcd.variance.var16x16 = vp8_variance16x16_armv6; - /*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; - cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c; - cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; + /*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;*/ + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_armv6; + /*cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;*/ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_armv6; cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_armv6; diff --git a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm new file mode 100644 index 000000000..7daecb925 --- /dev/null +++ b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm @@ -0,0 +1,95 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+;
+
+
+    EXPORT  |vp8_variance8x8_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_armv6| PROC
+
+    push    {r4-r10, lr}
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return variance
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index 64d76bcf8..ed1fb16d5 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -15,6 +15,34 @@
 
 #if HAVE_ARMV6
 
+unsigned int vp8_sub_pixel_variance8x8_armv6
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[10*8];
+    unsigned char second_pass[8*8];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            9, 8, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             8, 8, 8, VFilter);
+
+    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+                                 dst_pixels_per_line, sse);
+}
+
unsigned int vp8_sub_pixel_variance16x16_armv6 ( const unsigned char *src_ptr, diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h index 7ad7c76d3..86de27476 100644 --- a/vp8/encoder/arm/variance_arm.h +++ b/vp8/encoder/arm/variance_arm.h @@ -16,7 +16,9 @@ extern prototype_sad(vp8_sad16x16_armv6); extern prototype_variance(vp8_variance16x16_armv6); +extern prototype_variance(vp8_variance8x8_armv6); extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_armv6); extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6); extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6); extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6); @@ -30,12 +32,18 @@ extern prototype_variance(vp8_mse16x16_armv6); #undef vp8_variance_subpixvar16x16 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6 +#undef vp8_variance_subpixvar8x8 +#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_armv6 + #undef vp8_variance_var16x16 #define vp8_variance_var16x16 vp8_variance16x16_armv6 #undef vp8_variance_mse16x16 #define vp8_variance_mse16x16 vp8_mse16x16_armv6 +#undef vp8_variance_var8x8 +#define vp8_variance_var8x8 vp8_variance8x8_armv6 + #undef vp8_variance_halfpixvar16x16_h #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6 diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index b07ee8ffb..a11e1cad2 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -38,6 +38,7 @@ VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM) +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM) #File list for neon -- 2.40.0
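
Note on the arithmetic: vp8_variance8x8_armv6 computes the usual VP8 block
variance, the sum of pixel differences and the sum of squared differences
over the 8x8 block, returning sse - ((sum * sum) >> 6); the shift by 6
divides by the block's 64 pixels. A minimal C sketch of the same
computation (variance8x8_ref is an illustrative name, not from the tree):

    unsigned int variance8x8_ref(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
    {
        int i, j;
        int sum = 0;          /* signed sum of differences */
        unsigned int sq = 0;  /* sum of squared differences */

        for (i = 0; i < 8; i++)
        {
            for (j = 0; j < 8; j++)
            {
                int diff = src_ptr[j] - ref_ptr[j];
                sum += diff;
                sq += (unsigned int)(diff * diff);
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }

        *sse = sq;
        return sq - (unsigned int)((sum * sum) >> 6);
    }

The assembly reaches the same result four pixels at a time: usub8/sel split
each word of byte differences into positive and negative parts, usad8
accumulates the signed sum, and uxtb16 plus smlad square and accumulate two
halfword differences per instruction.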
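
The sub-pixel variant follows the same two-pass shape as the existing
vp8_sub_pixel_variance16x16_armv6: a horizontal bilinear pass over 9 source
rows into the halfword first_pass buffer, a vertical pass down to the final
8x8 block in second_pass, then the integer variance routine on the result.
A rough C sketch of that two-pass bilinear filtering, assuming VP8's 7-bit
tap convention where each vp8_bilinear_filters[offset] pair sums to 128
(the helper names and rounding below are illustrative, not the actual
armv6 helpers):

    static void bil_first_pass(const unsigned char *src, unsigned short *dst,
                               int src_stride, int height, int width,
                               const short *filter)
    {
        int r, c;

        /* height is 9 for an 8x8 block: one extra filtered row feeds
           the vertical pass */
        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
                dst[c] = (unsigned short)((src[c] * filter[0] +
                                           src[c + 1] * filter[1] +
                                           64) >> 7);
            src += src_stride;
            dst += width;
        }
    }

    static void bil_second_pass(const unsigned short *src, unsigned char *dst,
                                int pitch, int height, int width,
                                const short *filter)
    {
        int r, c;

        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
                dst[c] = (unsigned char)((src[c] * filter[0] +
                                          src[c + pitch] * filter[1] +
                                          64) >> 7);
            src += pitch;
            dst += width;
        }
    }

With xoffset or yoffset equal to 0 the corresponding filter pair degenerates
to {128, 0}, a plain copy, so the fixed 9/8/8 pass dimensions in the new
function cover every eighth-pel offset.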