From: Taekhyun Kim Date: Wed, 8 Jun 2011 19:12:45 +0000 (-0700) Subject: utilize preload in ARMv6 MC/LPF/Copy routines X-Git-Tag: v0.9.7~61^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=458fb8f4911a3f53f0601b883069f844c2e40fa5;p=libvpx utilize preload in ARMv6 MC/LPF/Copy routines About 9~10% decoding perf improvement on non-Neon ARM cpus Change-Id: I7dc2a026764e84e9c2faf282b4ae113090326837 --- diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm index a86ed5d0a..9704b4210 100644 --- a/vp8/common/arm/armv6/bilinearfilter_v6.asm +++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm @@ -30,11 +30,11 @@ ldr r4, [sp, #36] ; width mov r12, r3 ; outer-loop counter - sub r2, r2, r4 ; src increment for height loop - ;;IF ARCHITECTURE=6 - pld [r0] - ;;ENDIF + add r7, r2, r4 ; preload next row + pld [r0, r7] + + sub r2, r2, r4 ; src increment for height loop ldr r5, [r11] ; load up filter coefficients @@ -96,9 +96,8 @@ add r0, r0, r2 ; move to next input row subs r12, r12, #1 - ;;IF ARCHITECTURE=6 - pld [r0] - ;;ENDIF + add r9, r2, r4, lsl #1 ; adding back block width + pld [r0, r9] ; preload next row add r11, r11, #2 ; move over to next column mov r1, r11 diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm index fca91a0db..abf048c2f 100644 --- a/vp8/common/arm/armv6/copymem16x16_v6.asm +++ b/vp8/common/arm/armv6/copymem16x16_v6.asm @@ -22,9 +22,7 @@ ;push {r4-r7} ;preload - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] + pld [r0, #31] ; preload for next 16x16 block ands r4, r0, #15 beq copy_mem16x16_fast @@ -90,6 +88,8 @@ copy_mem16x16_1_loop ldrneb r6, [r0, #2] ldrneb r7, [r0, #3] + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_1_loop ldmia sp!, {r4 - r7} @@ -121,6 +121,8 @@ copy_mem16x16_4_loop ldrne r6, [r0, #8] ldrne r7, [r0, #12] + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_4_loop ldmia sp!, {r4 - r7} @@ -148,6 +150,7 
@@ copy_mem16x16_8_loop
 add r2, r2, r3
+ pld [r0, #31] ; preload for next 16x16 block
 bne copy_mem16x16_8_loop
 ldmia sp!, {r4 - r7}
@@ -171,6 +174,7 @@ copy_mem16x16_fast_loop
 ;stm r2, {r4-r7}
 add r2, r2, r3
+ pld [r0, #31] ; preload for next 16x16 block
 bne copy_mem16x16_fast_loop
 ldmia sp!, {r4 - r7}
diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm
index 03b5bccd7..1ba91ddd6 100644
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -10,6 +10,8 @@
 EXPORT |vp8_filter_block2d_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
 EXPORT |vp8_filter_block2d_second_pass_armv6|
 EXPORT |vp8_filter4_block2d_second_pass_armv6|
 EXPORT |vp8_filter_block2d_first_pass_only_armv6|
@@ -40,11 +42,6 @@
 add r12, r3, #16 ; square off the output
 sub sp, sp, #4
- ;;IF ARCHITECTURE=6
- ;pld [r0, #-2]
- ;;pld [r0, #30]
- ;;ENDIF
-
 ldr r4, [r11] ; load up packed filter coefficients
 ldr r5, [r11, #4]
 ldr r6, [r11, #8]
@@ -101,15 +98,10 @@
 bne width_loop_1st_6
- ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [r0, r2]
- ;;pld [r0, r9]
- ;;ENDIF
-
 ldr r1, [sp] ; load and update dst address
 subs r7, r7, #0x10000
 add r0, r0, r2 ; move to next input line
+
 add r1, r1, #2 ; move over to next column
 str r1, [sp]
@@ -120,6 +112,192 @@
 ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #18 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter 
coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_16_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_16_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. 
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_16_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #34 ; adding back block width(=16)
+ pld [r0, r11] ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_16_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #10 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_8_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data 
for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_8_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #18 ; adding back block width(=8)
+ pld [r0, r11] ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_8_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
 ;---------------------------------
 ; r0 short *src_ptr,
 ; r1 unsigned char *output_ptr,
@@ -262,6 +440,10 @@
 |vp8_filter_block2d_first_pass_only_armv6| PROC
 stmdb sp!, {r4 - r11, lr}
+ add r7, r2, r3 ; preload next row
+ add r7, r7, #2
+ pld [r0, r7]
+
 ldr r4, [sp, #36] ; output pitch
 ldr r11, [sp, #40] ; HFilter address
 sub sp, sp, #8
@@ -330,16 +512,15 @@
 bne width_loop_1st_only_6
- ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [r0, r2]
- ;;pld [r0, r9]
- ;;ENDIF
-
 ldr lr, [sp] ; load back output pitch
 ldr r12, [sp, #4] ; load back output pitch
 subs r7, r7, #1
 add r0, r0, r12 ; updata src for next loop
+
+ add r11, r12, r3 ; preload next row
+ add r11, r11, #2
+ pld [r0, r11]
+
 add r1, r1, lr ; update dst for next loop
 bne height_loop_1st_only_6
diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm
index b6417dee6..c7441b055 100644
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -253,12 +253,6 @@ count RN r5
 subs count, count, #1
- ;pld [src]
- ;pld [src, pstep]
- ;pld [src, pstep, lsl #1]
- ;pld [src, pstep, lsl #2]
- ;pld [src, pstep, lsl #3]
-
 ldrne r9, [src], pstep ; p3
 ldrne r10, [src], pstep ; p2
 ldrne r11, [src], pstep ; p1
@@ -857,15 +851,19 @@ count RN r5
 sub src, src, #4 ; move src pointer down by 4 
ldr count, [sp, #40] ; count for 8-in-parallel ldr r12, [sp, #36] ; load thresh address + pld [src, #23] ; preload for next block sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data ldr r4, [r2], #4 ; flimit + pld [src, #23] ldr r7, [src], pstep ldr r2, [r3], #4 ; limit + pld [src, #23] ldr r8, [src], pstep uadd8 r4, r4, r4 ; flimit * 2 ldr r3, [r12], #4 ; thresh + pld [src, #23] ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel uadd8 r4, r4, r2 ; flimit * 2 + limit @@ -1242,9 +1240,13 @@ count RN r5 sub src, src, #4 subs count, count, #1 + pld [src, #23] ; preload for next block ldrne r6, [src], pstep ; load source data + pld [src, #23] ldrne r7, [src], pstep + pld [src, #23] ldrne r8, [src], pstep + pld [src, #23] ldrne lr, [src], pstep bne MBVnext8 diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm index 013712036..40a71f49d 100644 --- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -154,22 +154,26 @@ pstep RN r1 ; load soure data to r7, r8, r9, r10 ldrh r3, [src, #-2] + pld [src, #23] ; preload for next block ldrh r4, [src], pstep uadd8 r12, r12, r12 ; flimit * 2 ldrh r5, [src, #-2] + pld [src, #23] ldrh r6, [src], pstep uadd8 r12, r12, r7 ; flimit * 2 + limit pkhbt r7, r3, r4, lsl #16 ldrh r3, [src, #-2] + pld [src, #23] ldrh r4, [src], pstep ldr r11, [sp, #40] ; count (r11) for 8-in-parallel pkhbt r8, r5, r6, lsl #16 ldrh r5, [src, #-2] + pld [src, #23] ldrh r6, [src], pstep mov r11, r11, lsl #1 ; 4-in-parallel @@ -259,19 +263,23 @@ pstep RN r1 ; load soure data to r7, r8, r9, r10 ldrneh r3, [src, #-2] + pld [src, #23] ; preload for next block ldrneh r4, [src], pstep ldrneh r5, [src, #-2] + pld [src, #23] ldrneh r6, [src], pstep pkhbt r7, r3, r4, lsl #16 ldrneh r3, [src, #-2] + pld [src, #23] ldrneh r4, [src], pstep pkhbt r8, r5, r6, lsl #16 ldrneh r5, [src, #-2] + pld [src, #23] ldrneh r6, [src], pstep bne 
simple_vnext8
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
index 029e02aa0..3fda1cefa 100644
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -35,6 +35,9 @@
 ldr r12, _filter8_coeff_
 sub r0, r0, r1, lsl #1
+ add r3, r1, #10 ; preload next row
+ pld [r0, r3]
+
 add r2, r12, r2, lsl #4 ;calculate filter location
 add r0, r0, #3 ;adjust src only for loading convinience
@@ -110,6 +113,9 @@
 add r0, r0, r1 ; move to next input line
 
+ add r11, r1, #18 ; preload next row. adding back block width(=8), which is subtracted earlier
+ pld [r0, r11]
+
 bne first_pass_hloop_v6
 ;second pass filter
diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
index fe3c5a52e..6582fb29a 100644
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -25,6 +25,28 @@ extern void vp8_filter_block2d_first_pass_armv6
 const short *vp8_filter
 );
+// 8x8
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+// 16x16
+extern void vp8_filter_block2d_first_pass_16x16_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
 extern void vp8_filter_block2d_second_pass_armv6
 (
 short *src_ptr,
@@ -143,12 +165,12 @@ void vp8_sixtap_predict8x8_armv6
 {
 if (yoffset & 0x1)
 {
- vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+ vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
 vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
 }
 else
 {
- vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), 
FData, src_pixels_per_line, 8, 13, HFilter); + vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter); vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); } } @@ -185,12 +207,12 @@ void vp8_sixtap_predict16x16_armv6 { if (yoffset & 0x1) { - vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); + vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); } else { - vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); + vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); } }