From d5a52edc117fdedb087034635d601888de553691 Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Thu, 31 Oct 2013 12:12:34 +0530 Subject: [PATCH] Added optimized vp9_idct32x32_34_add_dspr2 Change-Id: I2ba9467525b87a8e4a58f0c546e63031b4e38a4e --- vp9/common/mips/dspr2/vp9_itrans32_dspr2.c | 69 +++++++++++++++++++++- vp9/common/vp9_rtcd_defs.sh | 2 +- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c index d3aee73cb..bc6759400 100644 --- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -19,7 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; @@ -42,7 +43,7 @@ static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { const int const_2_power_13 = 8192; const int32_t *input_int; - for (i = 32; i--; ) { + for (i = no_rows; i--; ) { input_int = (const int32_t *)input; if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | @@ -881,12 +882,74 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr); + idct32_1d_rows_dspr2(input, outptr, 32); // Columns vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); } +void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + 
// Rows + idct32_1d_rows_dspr2(input, outptr, 8); + + outptr += 8; + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + for (i = 0; i < 31; ++i) { + outptr += 32; + + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + } + + // Columns + vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride); +} + void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { int r, out; diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 5e049c63c..3b9775a90 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -296,7 +296,7 @@ prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int specialize vp9_idct32x32_1024_add sse2 neon dspr2 prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_34_add sse2 +specialize vp9_idct32x32_34_add sse2 dspr2 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct32x32_1_add sse2 dspr2 -- 2.40.0