From cb14764fab88b5b28ba09fa9490bd72c017cb7c2 Mon Sep 17 00:00:00 2001
From: Tero Rintaluoma
Date: Wed, 9 Feb 2011 09:34:56 -0500
Subject: [PATCH] Adds armv6 optimized variance calculation

Adds the vp8_sub_pixel_variance16x16_armv6 function to the encoder.
Integrates the ARMv6 optimized bilinear interpolations from
vp8/common/arm/armv6 and adds a new assembly file for the variance16x16
calculation.
 - vp8_filter_block2d_bil_first_pass_armv6 (integrated)
 - vp8_filter_block2d_bil_second_pass_armv6 (integrated)
 - vp8_variance16x16_armv6 (new)
 - bilinearfilter_arm.h (new)

Change-Id: I18a8331ce7d031ceedd6cd415ecacb0c8f3392db
---
 vp8/common/arm/bilinearfilter_arm.c            |  21 +--
 vp8/common/arm/bilinearfilter_arm.h            |  35 +++++
 vp8/encoder/arm/arm_csystemdependent.c         |   8 +-
 .../arm/armv6/vp8_variance16x16_armv6.asm      | 147 ++++++++++++++++++
 vp8/encoder/arm/variance_arm.c                 |  34 ++++
 vp8/encoder/arm/variance_arm.h                 |  17 ++
 vp8/vp8_common.mk                              |   1 +
 vp8/vp8cx_arm.mk                               |   4 +-
 8 files changed, 242 insertions(+), 25 deletions(-)
 create mode 100644 vp8/common/arm/bilinearfilter_arm.h
 create mode 100644 vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm

diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
index 961d142c9..6a46ef685 100644
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -12,26 +12,7 @@
 #include <math.h>
 #include "filter.h"
 #include "subpixel.h"
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char  *src_ptr,
-    unsigned short *dst_ptr,
-    unsigned int    src_pitch,
-    unsigned int    height,
-    unsigned int    width,
-    const short    *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *dst_ptr,
-    int             dst_pitch,
-    unsigned int    height,
-    unsigned int    width,
-    const short    *vp8_filter
-);
+#include "arm/bilinearfilter_arm.h"

 void vp8_filter_block2d_bil_armv6
 (
diff --git a/vp8/common/arm/bilinearfilter_arm.h b/vp8/common/arm/bilinearfilter_arm.h
new file mode 100644
index 000000000..b7155d3f0
--- /dev/null
+++ b/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char *src_ptr,
+    unsigned short      *dst_ptr,
+    unsigned int         src_pitch,
+    unsigned int         height,
+    unsigned int         width,
+    const short         *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index a1f110260..6c17a7984 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -38,14 +38,14 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
     /*cpi->rtcd.variance.var4x4          = vp8_variance4x4_c;
     cpi->rtcd.variance.var8x8            = vp8_variance8x8_c;
     cpi->rtcd.variance.var8x16           = vp8_variance8x16_c;
-    cpi->rtcd.variance.var16x8           = vp8_variance16x8_c;
-    cpi->rtcd.variance.var16x16          = vp8_variance16x16_c;*/
+    cpi->rtcd.variance.var16x8           = vp8_variance16x8_c;*/
+    cpi->rtcd.variance.var16x16          = vp8_variance16x16_armv6;

     /*cpi->rtcd.variance.subpixvar4x4    = vp8_sub_pixel_variance4x4_c;
     cpi->rtcd.variance.subpixvar8x8      = vp8_sub_pixel_variance8x8_c;
     cpi->rtcd.variance.subpixvar8x16     = vp8_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8     = vp8_sub_pixel_variance16x8_c;
-    cpi->rtcd.variance.subpixvar16x16    = vp8_sub_pixel_variance16x16_c;*/
+    cpi->rtcd.variance.subpixvar16x8     = vp8_sub_pixel_variance16x8_c;*/
+    cpi->rtcd.variance.subpixvar16x16    = vp8_sub_pixel_variance16x16_armv6;

     /*cpi->rtcd.variance.mse16x16        = vp8_mse16x16_c;
     cpi->rtcd.variance.getmbss           = vp8_get_mb_ss_c;*/
diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
new file mode 100644
index 000000000..8d7258af7
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -0,0 +1,147 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0x0]      ; load 4 src pixels
+    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #0x4]      ; load 4 src pixels
+    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #0x8]      ; load 4 src pixels
+    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #0xc]      ; load 4 src pixels
+    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1        ; decrement loop counter (one row done)
+
+    bne     loop
+
+    ; return variance
+    ldr     r6, [sp, #0x28]     ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index b40c0482f..9737bef46 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -10,6 +10,40 @@

 #include "vpx_config.h"
 #include "variance.h"
+#include "filter.h"
+#include "arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[36*16];
+    unsigned char  second_pass[20*16];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                   dst_pixels_per_line, sse);
+}
+
+#endif

 #if HAVE_ARMV7
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index 3cbacfac3..06d72873e 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -12,6 +12,23 @@

 #ifndef VARIANCE_ARM_H
 #define VARIANCE_ARM_H
+#if HAVE_ARMV6
+
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
 #if HAVE_ARMV7
 extern prototype_sad(vp8_sad4x4_neon);
 extern prototype_sad(vp8_sad8x8_neon);
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 25909456a..af07618a1 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -116,6 +116,7 @@
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
 # common (c)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra_arm.c
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index b23ac96ca..abc5dc800 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -17,9 +17,10 @@
 VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
 VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/variance_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.h
 VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
 VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/boolhuff.c
@@ -33,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar

 #File list for armv6
 # encoder
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)

 #File list for neon
-- 
2.40.0
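
[Editor's note: the following reference sketch is not part of the patch. It restates in plain C what the ARMv6 assembly above computes, for readers who do not want to trace the usub8/sel/usad8/smlad instruction stream; the function name variance16x16_ref and the demo main are illustrative, not names from the tree. The assembly keeps the signed sum of differences in r8 and the sum of squared differences in r11, then returns sse - ((sum * sum) >> 8), where the shift by 8 is division by the 256 pixels of a 16x16 block.]

#include <stdio.h>

/* Scalar model of the arithmetic in vp8_variance16x16_armv6. */
static unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse)
{
    int sum = 0;            /* signed sum of differences, as in r8  */
    unsigned int sq = 0;    /* sum of squared differences, as in r11 */
    int i, j;

    for (i = 0; i < 16; i++)
    {
        for (j = 0; j < 16; j++)
        {
            int d = src[j] - ref[j];   /* per-pixel difference */
            sum += d;
            sq  += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
    }

    *sse = sq;
    /* 16 * 16 = 256 pixels, so the mean correction (sum * sum) / 256 is a
       right shift by 8 -- the final "sub r0, r11, r0, ASR #8" in the asm. */
    return sq - (unsigned int)((sum * sum) >> 8);
}

int main(void)
{
    unsigned char src[16 * 16], ref[16 * 16];
    unsigned int sse;
    int i;

    /* Synthetic data: ref differs from src by a small repeating offset. */
    for (i = 0; i < 16 * 16; i++)
    {
        src[i] = (unsigned char)(i & 0xff);
        ref[i] = (unsigned char)((i + (i % 3)) & 0xff);
    }

    printf("variance = %u\n", variance16x16_ref(src, 16, ref, 16, &sse));
    printf("sse      = %u\n", sse);
    return 0;
}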
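
[Editor's note: a second sketch, also not part of the patch, models the two bilinear passes behind vp8_sub_pixel_variance16x16_armv6. Two details of the wrapper in variance_arm.c are worth spelling out: the first (horizontal) pass runs over 17 rows rather than 16 because the second (vertical) pass combines each row with the row below it and so consumes height + 1 input rows, and the intermediate buffer is unsigned short so no precision is lost between passes. The 2-tap kernel below assumes VP8's bilinear coefficients sum to 128 with (x + 64) >> 7 rounding, per vp8/common/filter.c; treat the exact constants, and the _ref names, as assumptions for illustration.]

/* Illustrative scalar model of the two bilinear filter passes. */
static void bil_first_pass_ref(const unsigned char *src, unsigned short *dst,
                               unsigned int src_pitch, unsigned int height,
                               unsigned int width, const short *f)
{
    unsigned int i, j;

    /* Horizontal 2-tap filter.  height is 17 for a 16x16 block: one extra
       row so the vertical pass has a "row below" for every output row. */
    for (i = 0; i < height; i++)
    {
        for (j = 0; j < width; j++)
            dst[i * width + j] =
                (unsigned short)((src[j] * f[0] + src[j + 1] * f[1] + 64) >> 7);
        src += src_pitch;
    }
}

static void bil_second_pass_ref(const unsigned short *src, unsigned char *dst,
                                int dst_pitch, unsigned int height,
                                unsigned int width, const short *f)
{
    unsigned int i, j;

    /* Vertical 2-tap filter over the packed width-wide intermediate rows. */
    for (i = 0; i < height; i++)
    {
        for (j = 0; j < width; j++)
            dst[j] = (unsigned char)((src[j] * f[0] +
                                      src[j + width] * f[1] + 64) >> 7);
        src += width;
        dst += dst_pitch;
    }
}

[With these two passes, sub-pixel variance reduces to plain vp8_variance16x16_armv6 on the filtered block, which is why the patch needs only one new assembly kernel.]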