From: Tero Rintaluoma
Date: Mon, 30 May 2011 08:10:03 +0000 (+0300)
Subject: adds preload for armv6 encoder asm
X-Git-Tag: v0.9.7~113^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5305e79eae023f5c38745a6b43116fd1f774597d;p=libvpx

adds preload for armv6 encoder asm

Added preload (pld) instructions to the armv6 encoder optimizations.
About a 5% average speed-up on Tegra2 for a VGA@30fps sequence.

Change-Id: I41d74737720fb71ce7a316f07555357822f3347e
---

diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
index a9060d76f..000805d4f 100644
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -27,8 +27,11 @@
 |vp8_mse16x16_armv6| PROC

     push    {r4-r9, lr}

-    mov     r12, #16            ; set loop counter to 16 (=block height)
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
     mov     r4, #0              ; initialize sse = 0

 loop
@@ -39,8 +42,10 @@ loop
     mov     lr, #0              ; constant zero

     usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r8, lr          ; select bytes with positive difference
     usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r8, r9, lr          ; select bytes with negative difference

     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
index c759f7c65..1b4f5cf3b 100644
--- a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
@@ -24,6 +24,12 @@
 ; stack       max_sad (not used)
 |vp8_sad16x16_armv6| PROC
     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
     mov     r4, #0              ; sad = 0;
     mov     r5, #8              ; loop count

@@ -45,6 +51,9 @@ loop
     add     r0, r0, r1          ; set src pointer to next row
     add     r2, r2, r3          ; set dst pointer to next row

+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
     usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
     usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels

@@ -70,6 +79,9 @@ loop
     usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
     usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels

+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
     subs    r5, r5, #1          ; decrement loop counter
     add     r4, r4, r8          ; add partial sad values

diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
index 988376390..5feaa8bc2 100644
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance16x16_armv6| PROC

     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     mov     r11, #0             ; initialize sse = 0
     mov     r12, #16            ; set loop counter to 16 (=block height)
@@ -37,8 +41,10 @@ loop
     mov     lr, #0              ; constant zero

     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r9, lr          ; select bytes with negative difference

     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
index 7daecb925..adc353d20 100644
--- a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
@@ -23,6 +23,10 @@
 |vp8_variance8x8_armv6| PROC

     push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r12, #8             ; set loop counter to 8 (=block height)
     mov     r4, #0              ; initialize sum = 0
     mov     r5, #0              ; initialize sse = 0
@@ -35,8 +39,10 @@ loop
     mov     lr, #0              ; constant zero

     usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r10, r8, lr         ; select bytes with positive difference
     usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r8, r9, lr          ; select bytes with negative difference

     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
index 2350f3e8b..1b5489795 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_h_armv6| PROC

     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     ldr     r10, c80808080
     mov     r11, #0             ; initialize sse = 0
@@ -42,8 +46,10 @@ loop
     eor     r4, r4, r10

     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r6, lr          ; select bytes with negative difference

     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
index f9ae3b7e2..38c55edf8 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_hv_armv6| PROC

     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     ldr     r10, c80808080
     mov     r11, #0             ; initialize sse = 0
@@ -53,8 +57,10 @@ loop
     eor     r4, r4, r10

     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r6, lr          ; select bytes with negative difference

     ; calculate partial sums
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index 9e0a03548..22a50eb00 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_v_armv6| PROC

     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     ldr     r10, c80808080
     mov     r11, #0             ; initialize sse = 0
@@ -43,8 +47,10 @@ loop
     eor     r4, r4, r10

     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r6, lr          ; select bytes with negative difference

     ; calculate partial sums
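
The pattern is the same in every file: a pair of pld (preload) hints at
function entry covering the first rows, then per-row hints one to two strides
ahead inside the loop (pld [r0, r1, lsl #1] prefetches r0 + 2*r1, where r0 is
the source pointer and r1 its stride), so cache-line fills overlap the
usub8/usada8 arithmetic instead of stalling it. Below is a minimal C sketch of
the same idea, assuming GCC/Clang, whose __builtin_prefetch builtin typically
lowers to pld on ARM; the function name sad16x16_prefetch is illustrative
only, not libvpx's implementation.

    /*
     * Illustrative sketch only, not libvpx code: the prefetch-ahead
     * pattern from the diff, written in plain C. Rows ahead of the ones
     * currently being read are hinted into the cache so memory latency
     * overlaps with the SAD arithmetic.
     */
    #include <stdint.h>
    #include <stdlib.h>

    unsigned sad16x16_prefetch(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride)
    {
        unsigned sad = 0;

        /* Entry preloads, as in pld [r0, r1, lsl #0] / pld [r2, r3, lsl #0]:
         * start fetching the second row of each block before the loop. */
        __builtin_prefetch(src + src_stride);
        __builtin_prefetch(ref + ref_stride);

        for (int row = 0; row < 16; row++) {
            /* Per-row preloads two strides ahead, as in
             * pld [r0, r1, lsl #1] / pld [r2, r3, lsl #1]. */
            __builtin_prefetch(src + 2 * src_stride);
            __builtin_prefetch(ref + 2 * ref_stride);

            /* Sum of absolute differences over the current row. */
            for (int col = 0; col < 16; col++)
                sad += abs(src[col] - ref[col]);

            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }

Since pld is purely a hint, it cannot fault and has no architectural side
effects, the transformation changes timing only: cores that ignore the hint
produce identical results, which is why the commit needs nothing beyond the
speed measurement quoted above.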