Move inverse transform dspr2 functions from vp9 to vpx_dsp

author Jingning Han <jingning@google.com>

Mon, 3 Aug 2015 17:50:32 +0000 (10:50 -0700)

committer Jingning Han <jingning@google.com>

Mon, 3 Aug 2015 18:59:50 +0000 (11:59 -0700)
author Jingning Han <jingning@google.com>
Mon, 3 Aug 2015 17:50:32 +0000 (10:50 -0700)
committer Jingning Han <jingning@google.com>
Mon, 3 Aug 2015 18:59:50 +0000 (11:59 -0700)
diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c

index aca6550c8dd63255626dff40346072a8f7f7d35b..6ca83a00c5c7ce41a79d80cc6c060636b3fbbefb 100644 (file)
--- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -16,1074 +16,11 @@
  #include "vp9/common/vp9_common.h"
  #include "vp9/common/vp9_blockd.h"
  #include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
  #include "vpx_dsp/txfm_common.h"
  #include "vpx_ports/mem.h"
  
  #if HAVE_DSPR2
-static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
-                              uint32_t no_rows) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_10, step1_11, step1_12, step1_13;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-
-  for (i = no_rows; i--; ) {
-    /* prefetch row */
-    prefetch_load((const uint8_t *)(input + 16));
-
-    __asm__ __volatile__ (
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
-          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
-          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             4(%[input])                     \n\t"
-        "lh       %[load6],             28(%[input])                    \n\t"
-        "lh       %[load7],             20(%[input])                    \n\t"
-        "lh       %[load8],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
-          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
-          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
-          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
-          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
-        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
-        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
-        "sh       %[load5],             0(%[output])                    \n\t"
-        "sh       %[load6],             32(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "sh       %[load5],             192(%[output])                  \n\t"
-        "sh       %[load6],             224(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
-        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "sh       %[load5],             256(%[output])                  \n\t"
-        "sh       %[load6],             288(%[output])                  \n\t"
-        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
-        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
-        "sh       %[load5],             448(%[output])                  \n\t"
-        "sh       %[load6],             480(%[output])                  \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6)
-        : [output] "r" (output),
-          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
-          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
-          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
-          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
-          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
-          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
-    );
-
-    __asm__ __volatile__ (
-        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
-        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
-        "sh       %[load5],             64(%[output])                   \n\t"
-        "sh       %[load6],             96(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
-        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
-        "sh       %[load5],             128(%[output])                  \n\t"
-        "sh       %[load6],             160(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
-        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
-        "sh       %[load5],             320(%[output])                  \n\t"
-        "sh       %[load6],             352(%[output])                  \n\t"
-        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
-        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
-        "sh       %[load5],             384(%[output])                  \n\t"
-        "sh       %[load6],             416(%[output])                  \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6)
-        : [output] "r" (output),
-          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
-          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
-          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
-          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
-    );
-
-    input += 16;
-    output += 1;
-  }
-}
-
-static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                      int dest_stride) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_8, step1_9, step1_10, step1_11;
-  int step1_12, step1_13, step1_14, step1_15;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = vpx_ff_cropTbl;
-
-  /* prefetch vpx_ff_cropTbl */
-  prefetch_load(vpx_ff_cropTbl);
-  prefetch_load(vpx_ff_cropTbl +  32);
-  prefetch_load(vpx_ff_cropTbl +  64);
-  prefetch_load(vpx_ff_cropTbl +  96);
-  prefetch_load(vpx_ff_cropTbl + 128);
-  prefetch_load(vpx_ff_cropTbl + 160);
-  prefetch_load(vpx_ff_cropTbl + 192);
-  prefetch_load(vpx_ff_cropTbl + 224);
-
-  for (i = 0; i < 16; ++i) {
-    dest_pix = (dest + i);
-    __asm__ __volatile__ (
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
-          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
-        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
-        "extp     %[result4],           $ac2,            31             \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
-          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
-        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
-        "extp     %[result1],           $ac1,        31                 \n\t"
-
-        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
-        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
-        "extp     %[result2],           $ac3,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
-        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
-        "extp     %[result3],           $ac1,        31                 \n\t"
-
-        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
-        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
-        "extp     %[result4],           $ac2,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             4(%[input])                   \n\t"
-        "lh       %[load6],             28(%[input])                  \n\t"
-        "lh       %[load7],             20(%[input])                  \n\t"
-        "lh       %[load8],             12(%[input])                  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
-        "mthi     $zero,                $ac3                          \n\t"
-
-        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
-        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
-        "extp     %[result1],           $ac1,        31               \n\t"
-
-        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
-        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
-        "extp     %[result2],           $ac3,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
-        "mthi     $zero,                $ac2                          \n\t"
-
-        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
-        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
-        "extp     %[result3],           $ac1,        31               \n\t"
-
-        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
-        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
-        "extp     %[result4],           $ac2,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
-          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
-          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
-          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
-          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    step1_8 = step2_8 + step2_11;
-    step1_9 = step2_9 + step2_10;
-    step1_14 = step2_13 + step2_14;
-    step1_15 = step2_12 + step2_15;
-
-    __asm__ __volatile__ (
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
-          [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
-          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
-          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
-          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
-          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
-          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
-          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
-          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
-          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
-    );
-
-    input += 16;
-  }
-}
-/* Full 16x16 inverse DCT: row pass into a local buffer, then a column pass that adds into dest. */
-void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);  // row-pass result, consumed by the column pass
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc: wrdsp with mask 1 writes the DSPControl pos field */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows (all 16 of them)
-  idct16_rows_dspr2(input, out, 16);
-
-  // Then transform columns and add to dest
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-/* 16-point inverse ADST (iadst16) in plain C: reads input[0..15] and writes output[0..15]. */
-static void iadst16_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-  // Load the inputs in the interleaved order in which stage 1 pairs them (x0/x1, x2/x3, ...).
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
-           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {  // all inputs zero: emit zeros and return early
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = output[8]
-              = output[9] = output[10] = output[11] = output[12]
-              = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1: rotate each input pair by odd cospi angles, then combine s[k] +/- s[k+8] via dct_const_round_shift
-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
-
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8  = dct_const_round_shift(s0 - s8);
-  x9  = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
-
-  // stage 2: x0..x7 are only added/subtracted at distance 4; x8..x15 are rotated first and then round-shifted
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
-  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
-  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
-  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
-
-  // stage 3: rotate x4..x7 and x12..x15 by cospi_8_64/cospi_24_64; combine at distance 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
-
-  // stage 4: scale the sum/difference of pairs (2,3), (6,7), (10,11), (14,15) by +/-cospi_16_64, then round-shift
-  s2 = (- cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (- x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (- x10 + x11);
-  s14 = (- cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
-  // Store the results in permuted order; x8, x4, x13 and x1 are negated on output.
-  output[0] =  x0;
-  output[1] = -x8;
-  output[2] =  x12;
-  output[3] = -x4;
-  output[4] =  x6;
-  output[5] =  x14;
-  output[6] =  x10;
-  output[7] =  x2;
-  output[8] =  x3;
-  output[9] =  x11;
-  output[10] =  x15;
-  output[11] =  x7;
-  output[12] =  x5;
-  output[13] = -x13;
-  output[14] =  x9;
-  output[15] = -x1;
-}
-
  void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
                                  int pitch, int tx_type) {
    int i, j;
@@ -1168,150 +105,4 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
        break;
    }
  }
-/* 16x16 inverse DCT for coefficients confined to the upper-left 4x4: 4-row pass, zero fill, column pass. */
-void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
-                                int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
-  int16_t *outptr = out;
-  uint32_t i;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  idct16_rows_dspr2(input, outptr, 4);
-
-  outptr += 4;  // presumably the row pass filled entries 0..3 of each 16-entry line -- TODO confirm
-  for (i = 0; i < 6; ++i) {  // 6 passes x 2 entries per pass clear entries 4..15 of every line
-    __asm__ __volatile__ (  // each sw zeroes 4 bytes (two int16_t); offsets step by 32 bytes (16 int16_t)
-        "sw     $zero,    0(%[outptr])     \n\t"
-        "sw     $zero,   32(%[outptr])     \n\t"
-        "sw     $zero,   64(%[outptr])     \n\t"
-        "sw     $zero,   96(%[outptr])     \n\t"
-        "sw     $zero,  128(%[outptr])     \n\t"
-        "sw     $zero,  160(%[outptr])     \n\t"
-        "sw     $zero,  192(%[outptr])     \n\t"
-        "sw     $zero,  224(%[outptr])     \n\t"
-        "sw     $zero,  256(%[outptr])     \n\t"
-        "sw     $zero,  288(%[outptr])     \n\t"
-        "sw     $zero,  320(%[outptr])     \n\t"
-        "sw     $zero,  352(%[outptr])     \n\t"
-        "sw     $zero,  384(%[outptr])     \n\t"
-        "sw     $zero,  416(%[outptr])     \n\t"
-        "sw     $zero,  448(%[outptr])     \n\t"
-        "sw     $zero,  480(%[outptr])     \n\t"
-
-        :
-        : [outptr] "r" (outptr)  // NOTE(review): stores through outptr but declares no "memory" clobber
-    );
-
-    outptr += 2;
-  }
-
-  // Then transform columns
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-/* DC-only 16x16 inverse DCT: one offset derived from input[0] is applied, saturating, to the 16x16 dest block. */
-void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                               int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t vector_a1;
-  int32_t t1, t2, t3, t4;
-  int32_t vector_1, vector_2, vector_3, vector_4;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-
-    :
-    : [pos] "r" (pos)
-  );
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__ (  // a1 = (out + 32) >> 6; out itself is left incremented by 32
-      "addi     %[out],     %[out],     32      \n\t"
-      "sra      %[a1],      %[out],     6       \n\t"
-
-      : [out] "+r" (out), [a1] "=r" (a1)
-      :
-  );
-
-  if (a1 < 0) {  // negative offset: subtract |a1| from each byte with unsigned saturation
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-        /* NOTE(review): replv.qb replicates only the low byte, so |a1| > 255 would wrap -- confirm range */
-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 16; r--;) {  // 16 rows; each pass handles four words (16 bytes) and then steps by dest_stride
-      __asm__ __volatile__ (  // NOTE(review): reads and writes *dest but declares no "memory" clobber
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  } else {  // non-negative offset: add a1 to each byte with unsigned saturation
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "replv.qb   %[vector_a1],   %[a1]   \n\t"
-        /* NOTE(review): only the low byte of a1 is replicated, so a1 > 255 would wrap -- confirm range */
-        : [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 16; r--;) {  // 16 rows; each pass handles four words (16 bytes) and then steps by dest_stride
-      __asm__ __volatile__ (  // NOTE(review): reads and writes *dest but declares no "memory" clobber
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  }
-}
  #endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c

index aa801ecb67eedecfda45691173ef245bdd0a32f8..848f7c0aa4df09a346bf25bc20bebcb4a8ce600d 100644 (file)
--- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -16,354 +16,11 @@
  #include "vp9/common/vp9_common.h"
  #include "vp9/common/vp9_blockd.h"
  #include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
  #include "vpx_dsp/txfm_common.h"
  #include "vpx_ports/mem.h"
  
  #if HAVE_DSPR2
-static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
-  int16_t   step_0, step_1, step_2, step_3;
-  int       Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int       i;
-
-  for (i = 4; i--; ) {
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-
-        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp1],             8(%[output])                    \n\t"
-
-        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp2],             16(%[output])                   \n\t"
-
-        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp3],             24(%[output])                   \n\t"
-
-      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
-        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
-        [output] "+r" (output)
-      : [const_2_power_13] "r" (const_2_power_13),
-        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
-        [cospi_24_64] "r" (cospi_24_64),
-        [input] "r" (input)
-    );
-
-    input += 4;
-    output += 1;
-  }
-}
-
-static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                               int dest_stride) {
-  int16_t   step_0, step_1, step_2, step_3;
-  int       Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int       i;
-  uint8_t   *dest_pix;
-  uint8_t   *cm = vpx_ff_cropTbl;
-
-  /* prefetch vpx_ff_cropTbl */
-  prefetch_load(vpx_ff_cropTbl);
-  prefetch_load(vpx_ff_cropTbl +  32);
-  prefetch_load(vpx_ff_cropTbl +  64);
-  prefetch_load(vpx_ff_cropTbl +  96);
-  prefetch_load(vpx_ff_cropTbl + 128);
-  prefetch_load(vpx_ff_cropTbl + 160);
-  prefetch_load(vpx_ff_cropTbl + 192);
-  prefetch_load(vpx_ff_cropTbl + 224);
-
-  for (i = 0; i < 4; ++i) {
-      dest_pix = (dest + i);
-
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
-        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
-        [dest_pix] "+r" (dest_pix)
-      : [const_2_power_13] "r" (const_2_power_13),
-        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
-        [cospi_24_64] "r" (cospi_24_64),
-        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
-    );
-
-    input += 4;
-  }
-}
-
-void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // Rows
-  vp9_idct4_rows_dspr2(input, outptr);
-
-  // Columns
-  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  int       a1, absa1;
-  int       r;
-  int32_t   out;
-  int       t2, vector_a1, vector_a;
-  uint32_t  pos = 45;
-  int16_t   input_dc = input[0];
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-
-    :
-    : [pos] "r" (pos)
-  );
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
-  __asm__ __volatile__ (
-      "addi     %[out],     %[out],    8       \n\t"
-      "sra      %[a1],      %[out],    4       \n\t"
-
-      : [out] "+r" (out), [a1] "=r" (a1)
-      :
-  );
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "abs        %[absa1],     %[a1]         \n\t"
-        "replv.qb   %[vector_a1], %[absa1]      \n\t"
-
-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__ (
-          "lw             %[t2],          0(%[dest])                      \n\t"
-          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
-          "sw             %[vector_a],    0(%[dest])                      \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "replv.qb       %[vector_a1],   %[a1]     \n\t"
-        : [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__ (
-          "lw           %[t2],          0(%[dest])                        \n\t"
-          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
-          "sw           %[vector_a],    0(%[dest])                        \n\t"
-          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"
-
-          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  }
-}
-
-static void iadst4_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
-
-  x0 = s0 + s3 + s5;
-  x1 = s1 - s4 - s6;
-  x2 = sinpi_3_9 * s7;
-  x3 = s2;
-
-  s0 = x0 + x3;
-  s1 = x1 + x3;
-  s2 = x2;
-  s3 = x0 + x1 - x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
-}
-
  void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride, int tx_type) {
    int i, j;
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c

index 5270fa17f76df0a52b34359d74bb498ed425debe..37f3ca9fcbb7227e27bbc9ebf6684f38b6bdc8da 100644 (file)
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -15,538 +15,11 @@
  #include "./vp9_rtcd.h"
  #include "vp9/common/vp9_common.h"
  #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
  #include "vpx_dsp/txfm_common.h"
  #include "vpx_ports/mem.h"
  
  #if HAVE_DSPR2
-static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
-                             uint32_t no_rows) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  const int const_2_power_13 = 8192;
-  int Temp0, Temp1, Temp2, Temp3, Temp4;
-  int i;
-
-  for (i = no_rows; i--; ) {
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[Temp4],             $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp1],             16(%[output])                   \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp0],             32(%[output])                   \n\t"
-        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp1],             48(%[output])                   \n\t"
-
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp0],             64(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp1],             80(%[output])                   \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp0],             96(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp1],             112(%[output])                  \n\t"
-
-        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
-          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
-          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
-          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
-          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-          [Temp4] "=&r" (Temp4)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_24_64] "r" (cospi_24_64),
-          [output] "r" (output), [input] "r" (input)
-    );
-
-    input += 8;
-    output += 1;
-  }
-}
-
-static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                        int dest_stride) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int Temp0, Temp1, Temp2, Temp3;
-  int i;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = vpx_ff_cropTbl;
-
-  /* prefetch vpx_ff_cropTbl */
-  prefetch_load(vpx_ff_cropTbl);
-  prefetch_load(vpx_ff_cropTbl +  32);
-  prefetch_load(vpx_ff_cropTbl +  64);
-  prefetch_load(vpx_ff_cropTbl +  96);
-  prefetch_load(vpx_ff_cropTbl + 128);
-  prefetch_load(vpx_ff_cropTbl + 160);
-  prefetch_load(vpx_ff_cropTbl + 192);
-  prefetch_load(vpx_ff_cropTbl + 224);
-
-  for (i = 0; i < 8; ++i) {
-      dest_pix = (dest + i);
-
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_6],           $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /* add block */
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
-          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
-          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
-          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
-          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-          [dest_pix] "+r" (dest_pix)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_24_64] "r" (cospi_24_64),
-          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
-    );
-
-    input += 8;
-  }
-}
-
-void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 8);
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-static void iadst8_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3, x4, x5, x6, x7;
-
-  x0 = input[7];
-  x1 = input[0];
-  x2 = input[5];
-  x3 = input[2];
-  x4 = input[3];
-  x5 = input[4];
-  x6 = input[1];
-  x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-
-  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
-  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
-  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
-  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
-
-  output[0] =  x0;
-  output[1] = -x4;
-  output[2] =  x6;
-  output[3] = -x2;
-  output[4] =  x3;
-  output[5] = -x7;
-  output[6] =  x5;
-  output[7] = -x1;
-}
-
  void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride, int tx_type) {
    int i, j;
@@ -617,130 +90,4 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
        break;
    }
  }
-
-void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 4);
-
-  outptr += 4;
-
-  __asm__ __volatile__ (
-      "sw  $zero,   0(%[outptr])  \n\t"
-      "sw  $zero,   4(%[outptr])  \n\t"
-      "sw  $zero,  16(%[outptr])  \n\t"
-      "sw  $zero,  20(%[outptr])  \n\t"
-      "sw  $zero,  32(%[outptr])  \n\t"
-      "sw  $zero,  36(%[outptr])  \n\t"
-      "sw  $zero,  48(%[outptr])  \n\t"
-      "sw  $zero,  52(%[outptr])  \n\t"
-      "sw  $zero,  64(%[outptr])  \n\t"
-      "sw  $zero,  68(%[outptr])  \n\t"
-      "sw  $zero,  80(%[outptr])  \n\t"
-      "sw  $zero,  84(%[outptr])  \n\t"
-      "sw  $zero,  96(%[outptr])  \n\t"
-      "sw  $zero, 100(%[outptr])  \n\t"
-      "sw  $zero, 112(%[outptr])  \n\t"
-      "sw  $zero, 116(%[outptr])  \n\t"
-
-      :
-      : [outptr] "r" (outptr)
-  );
-
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t t1, t2, vector_a1, vector_1, vector_2;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-
-    :
-    : [pos] "r" (pos)
-  );
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__ (
-      "addi     %[out],     %[out],     16      \n\t"
-      "sra      %[a1],      %[out],     5       \n\t"
-
-      : [out] "+r" (out), [a1] "=r" (a1)
-      :
-  );
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-
-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__ (
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "replv.qb   %[vector_a1],   %[a1]   \n\t"
-
-        : [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__ (
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [dest] "+r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  }
-}
  #endif  // #if HAVE_DSPR2
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk

index f47d56a5a4f1f0a7aa3c55f1e8f4a949ce32ab87..735aaf14184a6e760aebf5922d872b8708e4665a 100644 (file)
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -71,15 +71,10 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
  VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
  endif
  
-# common (c)
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h
-
  ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
  VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans4_dspr2.c
  VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans8_dspr2.c
  VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c
  endif
  
  # common (msa)
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vpx_dsp/mips/inv_txfm_dspr2.h

similarity index 75%

rename from vp9/common/mips/dspr2/vp9_common_dspr2.h

rename to vpx_dsp/mips/inv_txfm_dspr2.h

index f8690fd61d6fe3ac88825261ddc483e93149f30a..537830b50b6b970e1401a6827b722896c25c58f7 100644 (file)
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vpx_dsp/mips/inv_txfm_dspr2.h
@@ -8,13 +8,14 @@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
-#ifndef VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
-#define VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
+#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
  
  #include <assert.h>
  
  #include "./vpx_config.h"
  #include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
  #include "vpx_dsp/mips/common_dspr2.h"
  
  #ifdef __cplusplus
@@ -50,10 +51,23 @@ extern "C" {
  
  void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                     int dest_stride);
+void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output);
+void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                     int dest_stride);
+void iadst4_dspr2(const int16_t *input, int16_t *output);
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                 int dest_stride);
+void iadst8_dspr2(const int16_t *input, int16_t *output);
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                       uint32_t no_rows);
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                               int dest_stride);
+void iadst16_dspr2(const int16_t *input, int16_t *output);
  
  #endif  // #if HAVE_DSPR2
  #ifdef __cplusplus
  }  // extern "C"
  #endif
  
-#endif  // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
+#endif  // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/vpx_dsp/mips/itrans16_dspr2.c b/vpx_dsp/mips/itrans16_dspr2.c

new file mode 100644 (file)

index 0000000..0e6e759
--- /dev/null
+++ b/vpx_dsp/mips/itrans16_dspr2.c
@@ -0,0 +1,1227 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                       uint32_t no_rows) {
+  int i;
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int step1_10, step1_11, step1_12, step1_13;
+  int step2_0, step2_1, step2_2, step2_3;
+  int step2_8, step2_9, step2_10, step2_11;
+  int step2_12, step2_13, step2_14, step2_15;
+  int load1, load2, load3, load4, load5, load6, load7, load8;
+  int result1, result2, result3, result4;
+  const int const_2_power_13 = 8192;
+
+  for (i = no_rows; i--; ) {
+    /* prefetch row */
+    prefetch_load((const uint8_t *)(input + 16));
+
+    __asm__ __volatile__ (
+        "lh       %[load1],              0(%[input])                    \n\t"
+        "lh       %[load2],             16(%[input])                    \n\t"
+        "lh       %[load3],              8(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[step2_0],           $ac1,           31              \n\t"
+        "extp     %[step2_1],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[step2_3],           $ac1,           31              \n\t"
+
+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             2(%[input])                     \n\t"
+        "lh       %[load6],             30(%[input])                    \n\t"
+        "lh       %[load7],             18(%[input])                    \n\t"
+        "lh       %[load8],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
+        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "sub      %[load5],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[result4],     %[result3]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             22(%[input])                    \n\t"
+        "lh       %[load3],             26(%[input])                    \n\t"
+        "lh       %[load4],             6(%[input])                     \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
+        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
+        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[result2],     %[result1]      \n\t"
+        "sub      %[load2],             %[result4],     %[result3]      \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             4(%[input])                     \n\t"
+        "lh       %[load6],             28(%[input])                    \n\t"
+        "lh       %[load7],             20(%[input])                    \n\t"
+        "lh       %[load8],             12(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
+        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[result4],     %[result3]      \n\t"
+        "sub      %[load5],             %[load5],       %[result1]      \n\t"
+        "add      %[load5],             %[load5],       %[result2]      \n\t"
+
+        "sub      %[load6],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[load6],       %[result3]      \n\t"
+        "add      %[load6],             %[load6],       %[result4]      \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+
+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_10],          $ac0,           31              \n\t"
+        "extp     %[step1_13],          $ac1,           31              \n\t"
+        "extp     %[step1_11],          $ac2,           31              \n\t"
+        "extp     %[step1_12],          $ac3,           31              \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
+        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
+        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
+        "sh       %[load5],             0(%[output])                    \n\t"
+        "sh       %[load6],             32(%[output])                   \n\t"
+        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "sh       %[load5],             192(%[output])                  \n\t"
+        "sh       %[load6],             224(%[output])                  \n\t"
+        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
+        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "sh       %[load5],             256(%[output])                  \n\t"
+        "sh       %[load6],             288(%[output])                  \n\t"
+        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
+        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
+        "sh       %[load5],             448(%[output])                  \n\t"
+        "sh       %[load6],             480(%[output])                  \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6)
+        : [output] "r" (output),
+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
+    );
+
+    __asm__ __volatile__ (
+        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
+        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
+        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
+        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
+        "sh       %[load5],             64(%[output])                   \n\t"
+        "sh       %[load6],             96(%[output])                   \n\t"
+        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
+        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
+        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
+        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
+        "sh       %[load5],             128(%[output])                  \n\t"
+        "sh       %[load6],             160(%[output])                  \n\t"
+        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
+        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
+        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
+        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
+        "sh       %[load5],             320(%[output])                  \n\t"
+        "sh       %[load6],             352(%[output])                  \n\t"
+        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
+        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
+        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
+        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
+        "sh       %[load5],             384(%[output])                  \n\t"
+        "sh       %[load6],             416(%[output])                  \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6)
+        : [output] "r" (output),
+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
+    );
+
+    input += 16;
+    output += 1;
+  }
+}
+
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                               int dest_stride) {
+  int i;
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int step1_8, step1_9, step1_10, step1_11;
+  int step1_12, step1_13, step1_14, step1_15;
+  int step2_0, step2_1, step2_2, step2_3;
+  int step2_8, step2_9, step2_10, step2_11;
+  int step2_12, step2_13, step2_14, step2_15;
+  int load1, load2, load3, load4, load5, load6, load7, load8;
+  int result1, result2, result3, result4;
+  const int const_2_power_13 = 8192;
+  uint8_t *dest_pix;
+  uint8_t *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 16; ++i) {
+    dest_pix = (dest + i);
+    __asm__ __volatile__ (
+        "lh       %[load1],              0(%[input])                    \n\t"
+        "lh       %[load2],             16(%[input])                    \n\t"
+        "lh       %[load3],              8(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[step2_0],           $ac1,           31              \n\t"
+        "extp     %[step2_1],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[step2_3],           $ac1,           31              \n\t"
+
+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             2(%[input])                     \n\t"
+        "lh       %[load6],             30(%[input])                    \n\t"
+        "lh       %[load7],             18(%[input])                    \n\t"
+        "lh       %[load8],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
+        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
+        "extp     %[result4],           $ac2,            31             \n\t"
+
+        "sub      %[load5],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[result4],     %[result3]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             22(%[input])                    \n\t"
+        "lh       %[load3],             26(%[input])                    \n\t"
+        "lh       %[load4],             6(%[input])                     \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
+        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
+        "extp     %[result1],           $ac1,        31                 \n\t"
+
+        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
+        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
+        "extp     %[result2],           $ac3,        31                 \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
+        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
+        "extp     %[result3],           $ac1,        31                 \n\t"
+
+        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
+        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
+        "extp     %[result4],           $ac2,        31                 \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[result2],     %[result1]      \n\t"
+        "sub      %[load2],             %[result4],     %[result3]      \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             4(%[input])                   \n\t"
+        "lh       %[load6],             28(%[input])                  \n\t"
+        "lh       %[load7],             20(%[input])                  \n\t"
+        "lh       %[load8],             12(%[input])                  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
+        "mthi     $zero,                $ac1                          \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
+        "mthi     $zero,                $ac3                          \n\t"
+
+        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
+        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
+        "extp     %[result1],           $ac1,        31               \n\t"
+
+        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
+        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
+        "extp     %[result2],           $ac3,        31               \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
+        "mthi     $zero,                $ac1                          \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
+        "mthi     $zero,                $ac2                          \n\t"
+
+        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
+        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
+        "extp     %[result3],           $ac1,        31               \n\t"
+
+        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
+        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
+        "extp     %[result4],           $ac2,        31               \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[result4],     %[result3]      \n\t"
+        "sub      %[load5],             %[load5],       %[result1]      \n\t"
+        "add      %[load5],             %[load5],       %[result2]      \n\t"
+
+        "sub      %[load6],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[load6],       %[result3]      \n\t"
+        "add      %[load6],             %[load6],       %[result4]      \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+
+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+
+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_10],          $ac0,           31              \n\t"
+        "extp     %[step1_13],          $ac1,           31              \n\t"
+        "extp     %[step1_11],          $ac2,           31              \n\t"
+        "extp     %[step1_12],          $ac3,           31              \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    step1_8 = step2_8 + step2_11;
+    step1_9 = step2_9 + step2_10;
+    step1_14 = step2_13 + step2_14;
+    step1_15 = step2_12 + step2_15;
+
+    __asm__ __volatile__ (
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
+          [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
+    );
+
+    input += 16;
+  }
+}
+
+void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+                                 int dest_stride) {  // 16x16 inverse DCT; result is added into dest
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);  // scratch: row-pass output, column-pass input
+  uint32_t pos = 45;  // value written to the DSP control register below
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows: all 16 rows of input are passed through the row pass
+  idct16_rows_dspr2(input, out, 16);
+
+  // Then transform columns and add to dest
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+                                int dest_stride) {  // 16x16 inverse DCT for sparse input (see row-pass comment); added into dest
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);  // scratch between the row and column passes
+  int16_t *outptr = out;
+  uint32_t i;
+  uint32_t pos = 45;  // value written to the DSP control register below
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows. Since all non-zero dct coefficients are in
+  // upper-left 4x4 area, we only need to calculate first 4 rows here.
+  idct16_rows_dspr2(input, outptr, 4);
+
+  outptr += 4;  // skip the first 4 int16 entries of every 16-entry line of out
+  for (i = 0; i < 6; ++i) {  // 6 iterations x 2 int16 per word covers entries 4..15
+    __asm__ __volatile__ (
+        "sw     $zero,    0(%[outptr])     \n\t"  /* one 32-bit zero store per line; */
+        "sw     $zero,   32(%[outptr])     \n\t"  /* lines are 32 bytes (16 int16) apart */
+        "sw     $zero,   64(%[outptr])     \n\t"
+        "sw     $zero,   96(%[outptr])     \n\t"
+        "sw     $zero,  128(%[outptr])     \n\t"
+        "sw     $zero,  160(%[outptr])     \n\t"
+        "sw     $zero,  192(%[outptr])     \n\t"
+        "sw     $zero,  224(%[outptr])     \n\t"
+        "sw     $zero,  256(%[outptr])     \n\t"
+        "sw     $zero,  288(%[outptr])     \n\t"
+        "sw     $zero,  320(%[outptr])     \n\t"
+        "sw     $zero,  352(%[outptr])     \n\t"
+        "sw     $zero,  384(%[outptr])     \n\t"
+        "sw     $zero,  416(%[outptr])     \n\t"
+        "sw     $zero,  448(%[outptr])     \n\t"
+        "sw     $zero,  480(%[outptr])     \n\t"  /* 16th and last line */
+
+        :
+        : [outptr] "r" (outptr)  /* NOTE(review): stores through outptr with no "memory" clobber - confirm this is intended */
+    );
+
+    outptr += 2;  // advance by one 32-bit word (two int16 entries)
+  }
+
+  // Then transform columns and add the result to dest
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                               int dest_stride) {  // DC-only path: one value derived from input[0] is applied to a 16x16 block of dest
+  uint32_t pos = 45;  // value written to the DSP control register below
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t vector_a1;  // a1 (or |a1|) replicated into all four bytes of a word
+  int32_t t1, t2, t3, t4;  // the four 32-bit words (16 pixels) of the current dest row
+  int32_t vector_1, vector_2, vector_3, vector_4;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],     32      \n\t"  /* a1 = (out + 32) >> 6 */
+      "sra      %[a1],      %[out],     6       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  if (a1 < 0) {  // negative offset: subtract |a1| from every pixel with unsigned saturation
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "abs        %[absa1],       %[a1]       \n\t"
+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"  /* NOTE(review): replicates only the low byte; presumably |a1| <= 255 - confirm */
+
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 16; r--;) {  // one iteration per dest row
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"  /* step to the next row */
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {  // non-negative offset: add a1 to every pixel with unsigned saturation
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb   %[vector_a1],   %[a1]   \n\t"  /* NOTE(review): replicates only the low byte; presumably a1 <= 255 - confirm */
+
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 16; r--;) {  // one iteration per dest row
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"  /* step to the next row */
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
+
+void iadst16_dspr2(const int16_t *input, int16_t *output) {  // 16-point 1-D transform in plain C: reads input[0..15], writes output[0..15]
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+  int x0 = input[15];  // inputs are loaded in a permuted order: odd x's ascend from input[0], even x's descend from input[15]
+  int x1 = input[0];
+  int x2 = input[13];
+  int x3 = input[2];
+  int x4 = input[11];
+  int x5 = input[4];
+  int x6 = input[9];
+  int x7 = input[6];
+  int x8 = input[7];
+  int x9 = input[8];
+  int x10 = input[5];
+  int x11 = input[10];
+  int x12 = input[3];
+  int x13 = input[12];
+  int x14 = input[1];
+  int x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;  // all-zero input: output is all zero, skip the arithmetic
+  }
+
+  // stage 1: rotate each (x[2k], x[2k+1]) pair, then butterfly pair k with pair k+4
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8  = dct_const_round_shift(s0 - s8);
+  x9  = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2: x0..x7 pass through unscaled; x8..x15 are rotated and rounded
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3: only x4..x7 and x12..x15 are multiplied by constants and rounded
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4: scale sums/differences of pairs (2,3), (6,7), (10,11), (14,15) by cospi_16_64
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+
+  output[0] =  x0;  // results are written in a permuted order, some negated
+  output[1] = -x8;
+  output[2] =  x12;
+  output[3] = -x4;
+  output[4] =  x6;
+  output[5] =  x14;
+  output[6] =  x10;
+  output[7] =  x2;
+  output[8] =  x3;
+  output[9] =  x11;
+  output[10] =  x15;
+  output[11] =  x7;
+  output[12] =  x5;
+  output[13] = -x13;
+  output[14] =  x9;
+  output[15] = -x1;
+}
+
+
+#endif  // HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vpx_dsp/mips/itrans32_cols_dspr2.c

similarity index 99%

rename from vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c

rename to vpx_dsp/mips/itrans32_cols_dspr2.c

index 48da85cbce37dcd8c4aaaf6e239f8ad113c3e9f2..c9cda52e359cab9338b1dc5d9db45bc8451e562e 100644 (file)
--- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+++ b/vpx_dsp/mips/itrans32_cols_dspr2.c
@@ -8,15 +8,9 @@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
-#include <assert.h>
-
  #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
  #include "vpx_dsp/txfm_common.h"
-#include "vpx_ports/mem.h"
  
  #if HAVE_DSPR2
  void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vpx_dsp/mips/itrans32_dspr2.c

similarity index 99%

rename from vp9/common/mips/dspr2/vp9_itrans32_dspr2.c

rename to vpx_dsp/mips/itrans32_dspr2.c

index b4b0d248c696280c0b9a709fb1929e8c5ab8dffb..25966346c16c191b8c70d98c3bc084e83c393744 100644 (file)
--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/vpx_dsp/mips/itrans32_dspr2.c
@@ -12,10 +12,7 @@
  #include <stdio.h>
  
  #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
  #include "vpx_dsp/txfm_common.h"
  
  #if HAVE_DSPR2
diff --git a/vpx_dsp/mips/itrans4_dspr2.c b/vpx_dsp/mips/itrans4_dspr2.c

new file mode 100644 (file)

index 0000000..b48e73b
--- /dev/null
+++ b/vpx_dsp/mips/itrans4_dspr2.c
@@ -0,0 +1,359 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {  // row pass of the 4x4 inverse DCT; results are stored transposed
+  int16_t   step_0, step_1, step_2, step_3;
+  int       Temp0, Temp1, Temp2, Temp3;
+  const int const_2_power_13 = 8192;  // preloaded into each accumulator before the multiply-accumulate
+  int       i;
+
+  for (i = 4; i--; ) {  // one iteration per input row of 4 coefficients
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[2]) * cospi_16_64;
+          step_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[2]) * cospi_16_64;
+          step_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             4(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "extp     %[step_0],            $ac0,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "extp     %[step_1],            $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        /*
+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+          step_2 = dct_const_round_shift(temp1);
+        */
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "extp     %[step_2],            $ac0,           31              \n\t"
+
+        /*
+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+          step_3 = dct_const_round_shift(temp2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[step_3],            $ac1,           31              \n\t"
+
+        /*
+          output[0]  = step_0 + step_3;
+          output[4]  = step_1 + step_2;
+          output[8]  = step_1 - step_2;
+          output[12] = step_0 - step_3;
+        */
+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "sh       %[Temp0],             0(%[output])                    \n\t"
+
+        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
+        "sh       %[Temp1],             8(%[output])                    \n\t"  /* byte offset 8 = int16 index 4 */
+
+        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
+        "sh       %[Temp2],             16(%[output])                   \n\t"
+
+        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
+        "sh       %[Temp3],             24(%[output])                   \n\t"
+
+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+        [output] "+r" (output)
+      : [const_2_power_13] "r" (const_2_power_13),
+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+        [cospi_24_64] "r" (cospi_24_64),
+        [input] "r" (input)
+    );
+
+    input += 4;  // next input row
+    output += 1;  // next output column: together with the 0/4/8/12 stores this transposes the block
+  }
+}
+
+void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                     int dest_stride) {  // second 4-point pass; rounded results are added into a 4x4 area of dest
+  int16_t   step_0, step_1, step_2, step_3;
+  int       Temp0, Temp1, Temp2, Temp3;
+  const int const_2_power_13 = 8192;  // preloaded into each accumulator before the multiply-accumulate
+  int       i;
+  uint8_t   *dest_pix;  // walks down one column of dest
+  uint8_t   *cm = vpx_ff_cropTbl;  // lookup table indexed by (pixel + residual); presumably a clamp table - confirm
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 4; ++i) {  // iteration i consumes 4 consecutive input values and updates column i of dest
+      dest_pix = (dest + i);
+
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[2]) * cospi_16_64;
+          step_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[2]) * cospi_16_64;
+          step_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             4(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "extp     %[step_0],            $ac0,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "extp     %[step_1],            $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        /*
+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+          step_2 = dct_const_round_shift(temp1);
+        */
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "extp     %[step_2],            $ac0,           31              \n\t"
+
+        /*
+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+          step_3 = dct_const_round_shift(temp2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[step_3],            $ac1,           31              \n\t"
+
+        /*
+          row 0: dest_pix[0] = cm[dest_pix[0] + ((step_0 + step_3 + 8) >> 4)];
+          row 1: same with (step_1 + step_2), after dest_pix += dest_stride
+          row 2: same with (step_1 - step_2)
+          row 3: same with (step_0 - step_3)
+        */
+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"  /* next row's value, computed early */
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+
+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+        [dest_pix] "+r" (dest_pix)
+      : [const_2_power_13] "r" (const_2_power_13),
+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+        [cospi_24_64] "r" (cospi_24_64),
+        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+    );
+
+    input += 4;  // next group of 4 intermediate values
+  }
+}
+
+void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {  // 4x4 inverse DCT: row pass into out, then column pass added into dest
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);  // scratch between the two passes
+  int16_t *outptr = out;
+  uint32_t pos = 45;  // value written to the DSP control register below
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // Rows
+  vp9_idct4_rows_dspr2(input, outptr);
+
+  // Columns
+  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {  // DC-only path: one value derived from input[0] is applied to a 4x4 block of dest
+  int       a1, absa1;
+  int       r;
+  int32_t   out;
+  int       t2, vector_a1, vector_a;  // t2: one 4-pixel dest row; vector_a1: offset replicated into 4 bytes
+  uint32_t  pos = 45;  // value written to the DSP control register below
+  int16_t   input_dc = input[0];
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],    8       \n\t"  /* a1 = (out + 8) >> 4 */
+      "sra      %[a1],      %[out],    4       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  if (a1 < 0) {  // negative offset: subtract |a1| from every pixel with unsigned saturation
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "abs        %[absa1],     %[a1]         \n\t"
+        "replv.qb   %[vector_a1], %[absa1]      \n\t"  /* NOTE(review): replicates only the low byte; presumably |a1| <= 255 - confirm */
+
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {  // one iteration per dest row
+      __asm__ __volatile__ (
+          "lw             %[t2],          0(%[dest])                      \n\t"
+          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
+          "sw             %[vector_a],    0(%[dest])                      \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"  /* step to the next row */
+
+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {  // non-negative offset: add a1 to every pixel with unsigned saturation
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb       %[vector_a1],   %[a1]     \n\t"
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {  // one iteration per dest row
+      __asm__ __volatile__ (
+          "lw           %[t2],          0(%[dest])                        \n\t"
+          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
+          "sw           %[vector_a],    0(%[dest])                        \n\t"
+          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"  /* step to the next row */
+
+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
+
+void iadst4_dspr2(const int16_t *input, int16_t *output) {  // 4-point 1-D transform in plain C using the sinpi_k_9 constants
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;  // all-zero input: output is all zero, skip the arithmetic
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;  // unscaled combination; multiplied by sinpi_3_9 below
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+#endif  // #if HAVE_DSPR2
diff --git a/vpx_dsp/mips/itrans8_dspr2.c b/vpx_dsp/mips/itrans8_dspr2.c

new file mode 100644 (file)

index 0000000..d3baba9
--- /dev/null
+++ b/vpx_dsp/mips/itrans8_dspr2.c
@@ -0,0 +1,668 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+/*
+ * Row pass of the 8x8 inverse DCT.
+ *
+ * Each iteration consumes one row of eight int16_t coefficients from input
+ * and stores its eight results eight elements apart in output (byte offsets
+ * 0, 16, ..., 112).  It then advances input by one row and output by one
+ * element, so the intermediate block ends up transposed for the column pass.
+ *
+ * Every product is accumulated on top of 8192 (1 << 13) and read back with
+ * "extp ..., 31".  The callers in this file program the DSP extract position
+ * to 45 with wrdsp before calling; this function relies on that state.
+ *
+ * input   - no_rows rows of 8 coefficients each.
+ * output  - scratch block that receives the transposed intermediate values.
+ * no_rows - number of input rows to transform.
+ */
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  const int const_2_power_13 = 8192;
+  int Temp0, Temp1, Temp2, Temp3, Temp4;
+  int i;
+
+  for (i = no_rows; i--; ) {
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "extp     %[Temp4],             $ac0,           31              \n\t"
+
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+        */
+        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
+
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        /* NOTE(review): Temp0 still holds input[1] here, so this reload looks
+           redundant; it is kept to leave the instruction stream untouched. */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /* final butterfly; results are stored 8 elements (16 bytes) apart */
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp0],             0(%[output])                    \n\t"
+        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp1],             16(%[output])                   \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp0],             32(%[output])                   \n\t"
+        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp1],             48(%[output])                   \n\t"
+
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp0],             64(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp1],             80(%[output])                   \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp0],             96(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp1],             112(%[output])                  \n\t"
+
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [Temp4] "=&r" (Temp4)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [output] "r" (output), [input] "r" (input)
+        /* The template stores through output and overwrites accumulators
+           $ac0 (hi/lo) and $ac1; tell the compiler about both. */
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+    );
+
+    input += 8;
+    output += 1;
+  }
+}
+
+/*
+ * Column pass of the 8x8 inverse DCT, fused with the add-to-destination step.
+ *
+ * Iteration i reads eight consecutive int16_t values from input (advancing
+ * input by 8 each time) and updates the eight pixels dest[i],
+ * dest[i + dest_stride], ..., i.e. one pixel column.  Each transformed value
+ * is rounded as (x + 16) >> 5, added to the existing pixel, and the sum is
+ * used as an index into the vpx_ff_cropTbl lookup table; the byte fetched
+ * from the table is stored back.
+ * NOTE(review): the index can be negative or above 255, so the table pointer
+ * presumably points into the middle of a larger clamp table - confirm against
+ * its initialisation.
+ *
+ * As in the row pass, "extp ..., 31" relies on the caller having programmed
+ * the DSP extract position with wrdsp.
+ *
+ * input       - 8x8 block of intermediate values (64 int16_t).
+ * dest        - top-left pixel of the 8x8 destination block.
+ * dest_stride - distance in bytes between destination rows.
+ */
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int Temp0, Temp1, Temp2, Temp3;
+  int i;
+  const int const_2_power_13 = 8192;
+  uint8_t *dest_pix;
+  uint8_t *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 8; ++i) {
+    /* top pixel of the destination column handled by this iteration */
+    dest_pix = (dest + i);
+
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        /* step1_6 temporarily carries step2_0 until step1_0/step1_3 exist */
+        "extp     %[step1_6],           $ac0,           31              \n\t"
+
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+        */
+        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
+
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        /* NOTE(review): Temp0 still holds input[1] here, so this reload looks
+           redundant; it is kept to leave the instruction stream untouched. */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /* add block: pixel = cm[pixel + ((value + 16) >> 5)], the next
+           butterfly value is computed while the previous one is stored */
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [dest_pix] "+r" (dest_pix)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+        /* The template reads input/cm, rewrites destination pixels and
+           overwrites accumulators $ac0 (hi/lo) and $ac1. */
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+    );
+
+    input += 8;
+  }
+}
+
+/*
+ * 8x8 inverse DCT over all eight coefficient rows, added to dest.
+ * Runs the row pass into a local 8x8 scratch block and then the column pass,
+ * which adds the result to the destination pixels.
+ */
+void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  uint32_t extract_pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  /* Row pass: all eight rows go through the 1-D transform. */
+  idct8_rows_dspr2(input, out, 8);
+
+  /* Column pass: transform the scratch block and add it to dest. */
+  idct8_columns_add_blk_dspr2(out, dest, dest_stride);
+}
+
+/*
+ * 8x8 inverse DCT that only transforms the first four coefficient rows,
+ * added to dest.
+ *
+ * The row pass stores its results transposed, so four input rows fill
+ * elements 0..3 of each of the eight scratch rows.  Elements 4..7 of every
+ * scratch row are then cleared explicitly (two word stores per row, rows are
+ * 16 bytes apart) before the column pass reads the whole block.
+ */
+void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct8_rows_dspr2(input, outptr, 4);
+
+  /* Skip the four elements per scratch row that the row pass produced. */
+  outptr += 4;
+
+  __asm__ __volatile__ (
+      "sw  $zero,   0(%[outptr])  \n\t"
+      "sw  $zero,   4(%[outptr])  \n\t"
+      "sw  $zero,  16(%[outptr])  \n\t"
+      "sw  $zero,  20(%[outptr])  \n\t"
+      "sw  $zero,  32(%[outptr])  \n\t"
+      "sw  $zero,  36(%[outptr])  \n\t"
+      "sw  $zero,  48(%[outptr])  \n\t"
+      "sw  $zero,  52(%[outptr])  \n\t"
+      "sw  $zero,  64(%[outptr])  \n\t"
+      "sw  $zero,  68(%[outptr])  \n\t"
+      "sw  $zero,  80(%[outptr])  \n\t"
+      "sw  $zero,  84(%[outptr])  \n\t"
+      "sw  $zero,  96(%[outptr])  \n\t"
+      "sw  $zero, 100(%[outptr])  \n\t"
+      "sw  $zero, 112(%[outptr])  \n\t"
+      "sw  $zero, 116(%[outptr])  \n\t"
+
+      :
+      : [outptr] "r" (outptr)
+      /* the stores above modify out[], which is not an output operand */
+      : "memory"
+  );
+
+  // Then transform columns and add to dest
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+/*
+ * DC-only 8x8 inverse DCT, added to dest.
+ *
+ * Only input[0] is read.  It is scaled with
+ * DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64, rounded as (out + 16) >> 5 and the
+ * resulting delta a1 is added to (or, when negative, its magnitude is
+ * subtracted from) every pixel of the 8x8 block using saturating quad-byte
+ * arithmetic, two 32-bit words per row.
+ *
+ * dest is accessed with word loads/stores, so each row must be four-byte
+ * aligned, as the original comments state.
+ */
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],     16      \n\t"
+      "sra      %[a1],      %[out],     5       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  if (a1 < 0) {
+    /* replv.qb replicates only the low 8 bits of its source, so a magnitude
+     * above 255 would wrap around.  Cap it at 255: the saturating subtract
+     * then drives every pixel to 0, which is what clipping pixel + a1 gives
+     * for any a1 <= -255. */
+    absa1 = (a1 < -255) ? 255 : -a1;
+
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
+
+        : [vector_a1] "=r" (vector_a1)
+        : [absa1] "r" (absa1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          /* destination pixels are read and rewritten through dest */
+          : "memory"
+      );
+    }
+  } else {
+    /* Same low-8-bit limitation as above: cap the delta at 255 so that the
+     * saturating add yields 255 for every pixel, matching a clipped
+     * pixel + a1 for any a1 >= 255. */
+    if (a1 > 255) a1 = 255;
+
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb   %[vector_a1],   %[a1]   \n\t"
+
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          /* destination pixels are read and rewritten through dest */
+          : "memory"
+      );
+    }
+  }
+}
+
+// 8-point inverse ADST written in plain C.
+// Reads input[0..7] in a permuted order, applies three butterfly stages with
+// rounding by DCT_CONST_BITS, and writes the sign-adjusted, re-ordered result
+// to output[0..7].  An all-zero input produces an all-zero output.
+void iadst8_dspr2(const int16_t *input, int16_t *output) {
+  // Order in which the input coefficients feed the first stage.
+  static const int kInputOrder[8] = { 7, 0, 5, 2, 3, 4, 1, 6 };
+  int x[8];     // permuted input
+  int p[8];     // rotation products before rounding
+  int a[8];     // values after stage 1
+  int b[8];     // values after stages 2 and 3
+  int any = 0;  // OR of all inputs, zero only if every input is zero
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    x[i] = input[kInputOrder[i]];
+    any |= x[i];
+  }
+
+  if (!any) {
+    for (i = 0; i < 8; ++i) output[i] = 0;
+    return;
+  }
+
+  // stage 1: four rotations followed by sum/difference butterflies
+  p[0] = cospi_2_64  * x[0] + cospi_30_64 * x[1];
+  p[1] = cospi_30_64 * x[0] - cospi_2_64  * x[1];
+  p[2] = cospi_10_64 * x[2] + cospi_22_64 * x[3];
+  p[3] = cospi_22_64 * x[2] - cospi_10_64 * x[3];
+  p[4] = cospi_18_64 * x[4] + cospi_14_64 * x[5];
+  p[5] = cospi_14_64 * x[4] - cospi_18_64 * x[5];
+  p[6] = cospi_26_64 * x[6] + cospi_6_64  * x[7];
+  p[7] = cospi_6_64  * x[6] - cospi_26_64 * x[7];
+
+  for (i = 0; i < 4; ++i) {
+    a[i]     = ROUND_POWER_OF_TWO((p[i] + p[i + 4]), DCT_CONST_BITS);
+    a[i + 4] = ROUND_POWER_OF_TWO((p[i] - p[i + 4]), DCT_CONST_BITS);
+  }
+
+  // stage 2: the lower half passes through, the upper half is rotated
+  p[4] =  cospi_8_64  * a[4] + cospi_24_64 * a[5];
+  p[5] =  cospi_24_64 * a[4] - cospi_8_64  * a[5];
+  p[6] = -cospi_24_64 * a[6] + cospi_8_64  * a[7];
+  p[7] =  cospi_8_64  * a[6] + cospi_24_64 * a[7];
+
+  b[0] = a[0] + a[2];
+  b[1] = a[1] + a[3];
+  b[2] = a[0] - a[2];
+  b[3] = a[1] - a[3];
+  b[4] = ROUND_POWER_OF_TWO((p[4] + p[6]), DCT_CONST_BITS);
+  b[5] = ROUND_POWER_OF_TWO((p[5] + p[7]), DCT_CONST_BITS);
+  b[6] = ROUND_POWER_OF_TWO((p[4] - p[6]), DCT_CONST_BITS);
+  b[7] = ROUND_POWER_OF_TWO((p[5] - p[7]), DCT_CONST_BITS);
+
+  // stage 3: scale the sum/difference of pairs (2,3) and (6,7)
+  p[2] = cospi_16_64 * (b[2] + b[3]);
+  p[3] = cospi_16_64 * (b[2] - b[3]);
+  p[6] = cospi_16_64 * (b[6] + b[7]);
+  p[7] = cospi_16_64 * (b[6] - b[7]);
+
+  b[2] = ROUND_POWER_OF_TWO((p[2]), DCT_CONST_BITS);
+  b[3] = ROUND_POWER_OF_TWO((p[3]), DCT_CONST_BITS);
+  b[6] = ROUND_POWER_OF_TWO((p[6]), DCT_CONST_BITS);
+  b[7] = ROUND_POWER_OF_TWO((p[7]), DCT_CONST_BITS);
+
+  // final re-ordering with alternating signs
+  output[0] =  b[0];
+  output[1] = -b[4];
+  output[2] =  b[6];
+  output[3] = -b[2];
+  output[4] =  b[3];
+  output[5] = -b[7];
+  output[6] =  b[5];
+  output[7] = -b[1];
+}
+#endif  // HAVE_DSPR2
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk

index 4158d64874b08bdd8bbc7151738b085ad3b6b6f1..6c2e520eed6d045f35602d28af5ecf38535861b1 100644 (file)
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -213,6 +213,13 @@ DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
  DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
  DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c
  DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c
+
+DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
  endif  # CONFIG_VP9
  
  # quantization
author	Jingning Han <jingning@google.com>
	Mon, 3 Aug 2015 17:50:32 +0000 (10:50 -0700)
committer	Jingning Han <jingning@google.com>
	Mon, 3 Aug 2015 18:59:50 +0000 (11:59 -0700)
vp9/common/mips/dspr2/vp9_itrans16_dspr2.c		patch \| blob \| history
vp9/common/mips/dspr2/vp9_itrans4_dspr2.c		patch \| blob \| history
vp9/common/mips/dspr2/vp9_itrans8_dspr2.c		patch \| blob \| history
vp9/vp9_common.mk		patch \| blob \| history
vpx_dsp/mips/inv_txfm_dspr2.h	[moved from vp9/common/mips/dspr2/vp9_common_dspr2.h with 75% similarity]	patch \| blob \| history
vpx_dsp/mips/itrans16_dspr2.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/mips/itrans32_cols_dspr2.c	[moved from vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c with 99% similarity]	patch \| blob \| history
vpx_dsp/mips/itrans32_dspr2.c	[moved from vp9/common/mips/dspr2/vp9_itrans32_dspr2.c with 99% similarity]	patch \| blob \| history
vpx_dsp/mips/itrans4_dspr2.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/mips/itrans8_dspr2.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/vpx_dsp.mk		patch \| blob \| history