From 3633ffcbf7b04b15e2dcb6b2834e1685fbbf6bf4 Mon Sep 17 00:00:00 2001
From: hkuang
Date: Tue, 21 Jan 2014 14:49:15 -0800
Subject: [PATCH] Add vp9_tm_predictor_16x16 neon implementation which is 3.5
 times faster than C.

Change-Id: I24439ba7a2971829c11620f34848facf2c916678
---
 vp9/common/arm/neon/vp9_reconintra_neon.asm | 99 +++++++++++++++++++++
 vp9/common/vp9_rtcd_defs.sh                 |  2 +-
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 71bf24c9f..ba1aebf31 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -18,6 +18,7 @@
     EXPORT  |vp9_h_predictor_32x32_neon|
     EXPORT  |vp9_tm_predictor_4x4_neon|
     EXPORT  |vp9_tm_predictor_8x8_neon|
+    EXPORT  |vp9_tm_predictor_16x16_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -403,4 +404,102 @@ loop_h
     bx                  lr
     ENDP                ; |vp9_tm_predictor_8x8_neon|
 
+;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                   const uint8_t *above,
+;                                   const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+;
+; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for 0 <= r, c < 16.
+; Reads above[-1..15] and left[0..15]; stores 16 rows of 16 bytes at dst.
+; Modifies r0, r2, r3, r12, q0-q3, q8-q11 and the flags.
+
+|vp9_tm_predictor_16x16_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    ldrb                r12, [r12]
+    vdup.u8             q0, r12
+
+    ; Load above 16 pixels
+    vld1.8              q1, [r2]
+
+    ; Compute above - ytop_left (widened to 16 bits: q2 = cols 0-7, q3 = 8-15)
+    vsubl.u8            q2, d2, d0
+    vsubl.u8            q3, d3, d1
+
+    ; Compute left + (above - ytop_left) row by row.
+    ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
+    mov                 r2, #2
+
+loop_16x16_neon
+    ; Load the 8 left pixels of this pass into d18 and widen them into q10.
+    ; Loading at the top of the pass (instead of preloading at the bottom)
+    ; keeps the final pass from reading past left[15].
+    vld1.8              d18, [r3]!
+    vmovl.u8            q10, d18
+
+    ; Process two rows.
+    vdup.16             q0, d20[0]
+    vdup.16             q8, d20[1]
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vdup.16             q0, d20[2]                  ; preload next 2 rows data
+    vdup.16             q8, d20[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vdup.16             q0, d21[0]                  ; preload next 2 rows data
+    vdup.16             q8, d21[1]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vdup.16             q0, d21[2]                  ; preload next 2 rows data
+    vdup.16             q8, d21[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    ; Process the last two rows of this pass; nothing is left to preload.
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqshrun.s16         d2, q1, #0
+    vqshrun.s16         d3, q0, #0
+    vqshrun.s16         d22, q11, #0
+    vqshrun.s16         d23, q8, #0
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    subs                r2, r2, #1
+    bgt                 loop_16x16_neon
+
+    bx                  lr
+    ENDP                ; |vp9_tm_predictor_16x16_neon|
+
     END
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 9105e9684..91ee22e52 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -135,7 +135,7 @@ prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const ui
 specialize vp9_v_predictor_16x16 $sse2_x86inc neon
 
 prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_16x16 $sse2_x86inc
+specialize vp9_tm_predictor_16x16 $sse2_x86inc neon
 
 prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2
-- 
2.40.0