From 1699eb0bf696da635361616a6a485807c7e6c211 Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Thu, 24 Oct 2013 11:29:04 +0530 Subject: [PATCH] mips dsp-ase r2 vp9 decoder idct module optimizations (rebase) Change-Id: Iedcdb8867084f328f4fce2fadb968e0984217308 --- vp9/common/mips/dspr2/vp9_common_dspr2.h | 3 + vp9/common/mips/dspr2/vp9_itrans16_dspr2.c | 1315 +++++++++++++++++ .../mips/dspr2/vp9_itrans32_cols_dspr2.c | 1073 ++++++++++++++ vp9/common/mips/dspr2/vp9_itrans32_dspr2.c | 1013 +++++++++++++ vp9/common/mips/dspr2/vp9_itrans4_dspr2.c | 438 ++++++ vp9/common/mips/dspr2/vp9_itrans8_dspr2.c | 745 ++++++++++ vp9/common/vp9_rtcd_defs.sh | 26 +- vp9/vp9_common.mk | 5 + 8 files changed, 4605 insertions(+), 13 deletions(-) create mode 100644 vp9/common/mips/dspr2/vp9_itrans16_dspr2.c create mode 100644 vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c create mode 100644 vp9/common/mips/dspr2/vp9_itrans32_dspr2.c create mode 100644 vp9/common/mips/dspr2/vp9_itrans4_dspr2.c create mode 100644 vp9/common/mips/dspr2/vp9_itrans8_dspr2.c diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h index dc88f1603..644264f65 100644 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -81,6 +81,9 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); + void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c new file mode 100644 index 000000000..1b2f5506a --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -0,0 +1,1315 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--; ) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], 
%[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], 
%[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_0], 
%[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) + ); + + input += 16; + output += 1; + } +} + +static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int 
load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd 
$ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 
\n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__ ( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 
32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu 
%[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), + [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) + ); + + input += 16; + } +} + +void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct16_1d_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +static void iadst16_1d(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int 
x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); 
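  /* Reference note (annotation, not part of the original commit):
   * dct_const_round_shift(x) is ROUND_POWER_OF_TWO(x, DCT_CONST_BITS),
   * i.e. (x + 8192) >> 14, with DCT_CONST_BITS == 14.  The hand-written
   * DSPr2 paths in this file compute the same rounding inside the DSP
   * accumulators: mtlo/mthi preload an accumulator with const_2_power_13
   * (8192), madd/msub accumulate the cospi products, and "extp ..., 31"
   * with the wrdsp extract position set to 45 reads back bits 45..14 of
   * the accumulator, which is exactly the (x + 8192) >> 14. */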
+ x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int pitch, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + int16_t temp_out[16]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct16_1d_rows_dspr2(input, outptr, 16); + idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct16_1d_rows_dspr2(input, outptr, 16); + + outptr = out; + + for (i = 0; i < 16; ++i) { + iadst16_1d(outptr, temp_out); + + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + outptr += 16; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + { + int16_t temp_in[16 * 16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) + for (j = 0; j < 16; ++j) + temp_in[j * 16 + i] = out[i * 16 + j]; + + idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + } + break; + case ADST_ADST: // ADST in both directions + { + int16_t temp_in[16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + iadst16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + } + } + break; + default: + printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. 
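  /* Annotation (not part of the original commit): idct16_1d_rows_dspr2()
   * stores its results transposed, so output k of input row i lands at
   * out[k * 16 + i].  With only the first 4 rows computed, entries 4..15
   * of every group of 16 are still uninitialized; the sw $zero loop a few
   * lines below (after outptr is advanced by 4) is the hand-unrolled
   * equivalent of, roughly:
   *
   *   for (k = 0; k < 16; ++k)
   *     for (i = 4; i < 16; ++i)
   *       out[k * 16 + i] = 0;
   */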
+ idct16_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + outptr += 2; + } + + // Then transform columns + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" 
(vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c new file mode 100644 index 000000000..5e92db3d2 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -0,0 +1,1073 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], 
%[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + 
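      /* Annotation (not part of the original commit): the lh offsets in
         these blocks are byte offsets into an int16_t array, so offset / 2
         gives the coefficient index; e.g. 10(%[input]) above reads input[5]
         and 54(%[input]) reads input[27]. */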
"mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" 
(step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub 
%[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), + [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), + [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - 
step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], 
%[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + 
step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), + [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), + [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), + [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), + [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], 
%[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), + [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), + 
[step1_26] "r" (step1_26), [step1_27] "r" (step1_27) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), + [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" 
+ "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), + [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), + [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + 
"lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c new file mode 100644 index 000000000..d3aee73cb --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = 32; i--; ) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__ ( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh 
$zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + "sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r" (output) + ); + + output += 1; + + continue; + } + + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 32)); + vp9_prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, 
%[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" 
(cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp 
%[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, 
%[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : 
[const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + 
"mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), + [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + 
__asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), + [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = step1_2 + step2_29; + output[3 * 32] = step1_3 + step2_28; + output[4 * 32] = step1_4 + step1_27; + output[5 * 32] = step1_5 + step1_26; + output[6 * 32] = step1_6 + step1_25; + output[7 * 32] = step1_7 + step1_24; + output[8 * 32] = step1_8 + step1_23; + output[9 * 32] = step1_9 + step1_22; + output[10 * 32] = step1_10 + step1_21; + output[11 * 32] = step1_11 + step1_20; + output[12 * 32] = step1_12 + step2_19; + output[13 * 32] = step1_13 + step2_18; + output[14 * 32] = step1_14 + step2_17; + output[15 * 32] = step1_15 + step2_16; + output[16 * 32] = step1_15 - step2_16; + output[17 * 32] = step1_14 - step2_17; + output[18 * 32] = step1_13 - step2_18; + output[19 * 32] = step1_12 - step2_19; + output[20 * 32] = step1_11 - step1_20; + output[21 * 32] = step1_10 - step1_21; + output[22 * 32] = step1_9 - step1_22; + output[23 * 32] = step1_8 - step1_23; + output[24 * 32] = step1_7 - step1_24; + output[25 * 32] = step1_6 - step1_25; + output[26 * 32] = step1_5 - step1_26; + output[27 * 32] = step1_4 - step1_27; + output[28 * 32] = step1_3 - step2_28; + output[29 * 32] = step1_2 - step2_29; + output[30 * 32] = step1_1 - step2_30; + output[31 * 32] = step1_0 - step2_31; + + input += 32; + output += 1; + } +} + +void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + idct32_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw 
%[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c new file mode 100644 index 000000000..5b7aa5e71 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { +  int16_t   step_0, step_1, step_2, step_3; +  int       Temp0, Temp1, Temp2, Temp3; +  const int const_2_power_13 = 8192; +  int       i; + +  for (i = 4; i--; ) { +    __asm__ __volatile__ ( +        /* +          temp_1 = (input[0] + input[2]) * cospi_16_64; +          step_0 = dct_const_round_shift(temp_1); + +          temp_2 = (input[0] - input[2]) * cospi_16_64; +          step_1 = dct_const_round_shift(temp_2); +        */ +        "lh       %[Temp0],             0(%[input])                     \n\t" +        "lh       %[Temp1],             4(%[input])                     \n\t" +        "mtlo     %[const_2_power_13],  $ac0                            \n\t" +        "mthi     $zero,                $ac0                            \n\t" +        "mtlo     %[const_2_power_13],  $ac1                            \n\t" +        "mthi     $zero,                $ac1                            \n\t" +        "add      %[Temp2],             %[Temp0],         %[Temp1]      \n\t" +        "sub      %[Temp3],             %[Temp0],         %[Temp1]      \n\t" +        "madd     $ac0,                 %[Temp2],         %[cospi_16_64]  \n\t" +        "lh       %[Temp0],             2(%[input])                     \n\t" +        "lh       %[Temp1],             6(%[input])                     \n\t" +        "extp     %[step_0],            $ac0,             31            \n\t" +        "mtlo     %[const_2_power_13],  $ac0                            \n\t" +        "mthi     $zero,                $ac0                            \n\t" + +        "madd     $ac1,                 %[Temp3],         %[cospi_16_64]  \n\t" +        "extp     %[step_1],            $ac1,             31            \n\t" +        "mtlo     %[const_2_power_13],  $ac1                            \n\t" +        "mthi     $zero,                $ac1                            \n\t" + +        /* +          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; +          step_2 = dct_const_round_shift(temp1); +        */ +        "madd     $ac0,                 %[Temp0],         %[cospi_24_64]  \n\t" +        "msub     $ac0,                 %[Temp1],         %[cospi_8_64]   \n\t" +        "extp     %[step_2],            $ac0,             31            \n\t" + +        /* +          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; +          step_3 = dct_const_round_shift(temp2); +        */ +        "madd     $ac1,                 %[Temp0],         %[cospi_8_64]   \n\t" +        "madd     $ac1,                 %[Temp1],         %[cospi_24_64]  \n\t" +        "extp     %[step_3],            $ac1,             31            \n\t" + +        /* +          output[0]  = step_0 + step_3; +          output[4]  = step_1 + step_2; +          output[8]  = step_1 - step_2; +          output[12] = step_0 - step_3; +        */ +        "add      %[Temp0],             %[step_0],        %[step_3]     \n\t" +        "sh       %[Temp0],             0(%[output])                    \n\t" + +        "add      %[Temp1],             %[step_1],        %[step_2]     \n\t" +        "sh       %[Temp1],             8(%[output])                    \n\t" + +        "sub      %[Temp2],             %[step_1],        %[step_2]     \n\t" +        "sh       %[Temp2],             16(%[output])                   \n\t" + +        "sub      %[Temp3],             %[step_0],        %[step_3]     \n\t" +        "sh       %[Temp3],             24(%[output])                   \n\t" + +        : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), +          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), +          [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), +          [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), +          [output] "+r" (output) +        : [const_2_power_13] "r" (const_2_power_13), +          [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), +          [cospi_24_64] "r" (cospi_24_64), +          [input] "r" (input) +    ); + +    input += 4; +    output += 1; +  } +} + +static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, +                                               int dest_stride) { +  int16_t   step_0, step_1, step_2, step_3; +  int       Temp0, Temp1, Temp2, Temp3; +  const int const_2_power_13 = 8192; +  int       i; +  uint8_t   *dest_pix; +  uint8_t   *cm = vp9_ff_cropTbl; + +  /* prefetch vp9_ff_cropTbl */ +  vp9_prefetch_load(vp9_ff_cropTbl); +  vp9_prefetch_load(vp9_ff_cropTbl +  32); +  vp9_prefetch_load(vp9_ff_cropTbl +  64); +  vp9_prefetch_load(vp9_ff_cropTbl +  96); +  vp9_prefetch_load(vp9_ff_cropTbl + 128); +  vp9_prefetch_load(vp9_ff_cropTbl + 160); +  vp9_prefetch_load(vp9_ff_cropTbl + 192); +  vp9_prefetch_load(vp9_ff_cropTbl + 224); + +  for (i = 0; i < 4; ++i) { +      dest_pix = (dest + i); + +    __asm__ __volatile__ ( +        /* +          temp_1 = (input[0] + input[2]) * cospi_16_64; +          step_0 = dct_const_round_shift(temp_1); + +          temp_2 =
(input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 4; + } +} + +void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + vp9_idct4_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void vp9_idct4x4_1_add_dspr2(const 
int16_t *input, uint8_t *dest, + int dest_stride) { + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); + __asm__ __volatile__ ( + "addi %[out], %[out], 8 \n\t" + "sra %[a1], %[out], 4 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 4; r--;) { + __asm__ __volatile__ ( + "lw %[t2], 0(%[dest]) \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 4; r--;) { + __asm__ __volatile__ ( + "lw %[t2], 0(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} + +static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
+ output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} + +void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + int16_t temp_in[4 * 4], temp_out[4]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + vp9_idct4_1d_rows_dspr2(input, outptr); + + outptr = out; + + for (i = 0; i < 4; ++i) { + iadst4_1d_dspr2(outptr, temp_out); + + for (j = 0; j < 4; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); + + outptr += 4; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + for (i = 0; i < 4; ++i) { + iadst4_1d_dspr2(input, outptr); + input += 4; + outptr += 4; + } + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) { + temp_in[i * 4 + j] = out[j * 4 + i]; + } + } + vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + break; + case ADST_ADST: // ADST in both directions + for (i = 0; i < 4; ++i) { + iadst4_1d_dspr2(input, outptr); + input += 4; + outptr += 4; + } + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + iadst4_1d_dspr2(temp_in, temp_out); + + for (j = 0; j < 4; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); + } + break; + default: + printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); + break; + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c new file mode 100644 index 000000000..93a08401d --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, +                                uint32_t no_rows) { +  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; +  const int const_2_power_13 = 8192; +  int Temp0, Temp1, Temp2, Temp3, Temp4; +  int i; + +  for (i = no_rows; i--; ) { +    __asm__ __volatile__ ( +        /* +          temp_1 = (input[0] + input[4]) * cospi_16_64; +          step2_0 = dct_const_round_shift(temp_1); + +          temp_2 = (input[0] - input[4]) * cospi_16_64; +          step2_1 = dct_const_round_shift(temp_2); +        */ +        "lh       %[Temp0],             0(%[input])                     \n\t" +        "lh       %[Temp1],             8(%[input])                     \n\t" +        "mtlo     %[const_2_power_13],  $ac0                            \n\t" +        "mthi     $zero,                $ac0                            \n\t" +        "mtlo     %[const_2_power_13],  $ac1                            \n\t" +        "mthi     $zero,                $ac1                            \n\t" +        "add      %[Temp2],             %[Temp0],         %[Temp1]      \n\t" +        "madd     $ac0,                 %[Temp2],         %[cospi_16_64]  \n\t" +        "extp     %[Temp4],             $ac0,             31            \n\t" + +        "sub      %[Temp3],             %[Temp0],         %[Temp1]      \n\t" +        "madd     $ac1,                 %[Temp3],         %[cospi_16_64]  \n\t" +        "mtlo     %[const_2_power_13],  $ac0                            \n\t" +        "mthi     $zero,                $ac0                            \n\t" +        "extp     %[Temp2],             $ac1,             31            \n\t" + +        /* +          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; +          step2_2 = dct_const_round_shift(temp_1); +        */ +        "lh       %[Temp0],             4(%[input])                     \n\t" +        "lh       %[Temp1],             12(%[input])                    \n\t" +        "madd     $ac0,                 %[Temp0],         %[cospi_24_64]  \n\t" +        "msub     $ac0,                 %[Temp1],         %[cospi_8_64]   \n\t" +        "mtlo     %[const_2_power_13],  $ac1                            \n\t" +        "mthi     $zero,                $ac1                            \n\t" +        "extp     %[Temp3],             $ac0,             31            \n\t" + +        /* +          step1_1 = step2_1 + step2_2; +          step1_2 = step2_1 - step2_2; +        */ +        "add      %[step1_1],           %[Temp2],         %[Temp3]      \n\t" +        "sub      %[step1_2],           %[Temp2],         %[Temp3]      \n\t" + +        /* +          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; +          step2_3 = dct_const_round_shift(temp_2); +        */ +        "madd     $ac1,                 %[Temp0],         %[cospi_8_64]   \n\t" +        "madd     $ac1,                 %[Temp1],         %[cospi_24_64]  \n\t" +        "extp     %[Temp1],             $ac1,             31            \n\t" + +        "mtlo     %[const_2_power_13],  $ac0                            \n\t" +        "mthi     $zero,                $ac0                            \n\t" + +        /* +          step1_0 = step2_0 + step2_3; +          step1_3 = step2_0 - step2_3; +        */ +        "add      %[step1_0],           %[Temp4],         %[Temp1]      \n\t" +        "sub      %[step1_3],           %[Temp4],         %[Temp1]      \n\t" + +        /* +          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; +          step1_4 = dct_const_round_shift(temp_1); +        */ +        "lh       %[Temp0],             2(%[input])                     \n\t" +        "madd     $ac0,                 %[Temp0],         %[cospi_28_64]  \n\t" +        "mtlo     %[const_2_power_13],  $ac1                            \n\t" +        "mthi     $zero,                $ac1                            \n\t" +        "lh       %[Temp1],             14(%[input])                    \n\t" +        "lh       %[Temp0],             2(%[input])                     \n\t" +        "msub     $ac0,                 %[Temp1],         %[cospi_4_64]   \n\t" +        "extp     %[step1_4],           $ac0,             31            \n\t" + +        /* +          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; +          step1_7 = dct_const_round_shift(temp_2); +        */ +        "madd     $ac1,                 %[Temp0],         %[cospi_4_64]   \n\t" +        "madd     $ac1,                 %[Temp1],         %[cospi_28_64]  \n\t" +        "extp     %[step1_7],           $ac1,             31            \n\t" + +        /* +          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; +          step1_5 = dct_const_round_shift(temp_1); +        */ +        "mtlo     %[const_2_power_13],  $ac0                            \n\t" +        "mthi     $zero,                $ac0                            \n\t" +        "lh       %[Temp0],             10(%[input])                    \n\t" +        "madd     $ac0,                 %[Temp0],         %[cospi_12_64]  \n\t" +        "lh       %[Temp1],             6(%[input])                     \n\t" +        "msub     $ac0,                 %[Temp1],         %[cospi_20_64]  \n\t" +        "extp     %[step1_5],           $ac0,             31            \n\t" + +        /* +          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; +          step1_6 = dct_const_round_shift(temp_2); +        */ +        "mtlo     %[const_2_power_13],  $ac1                            \n\t" +        "mthi     $zero,                $ac1                            \n\t" +        "lh       %[Temp0],             10(%[input])                    \n\t" +        "madd     $ac1,                 %[Temp0],         %[cospi_20_64]  \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [output] "r" (output), [input] "r" (input) + ); + + input += 8; + output += 1; + } +} + +static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], 
%[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 
\n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 8; + } +} + +void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit 
positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct8_1d_rows_dspr2(input, outptr, 8); + + // Then transform columns and add to dest + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3, x4, x5, x6, x7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); + x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); + x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); + x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} + +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + int16_t temp_in[8 * 8], temp_out[8]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct8_1d_rows_dspr2(input, outptr, 8); + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct8_1d_rows_dspr2(input, outptr, 8); + + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(&out[i * 8], temp_out); + + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(input, outptr); + input 
+= 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) { + temp_in[i * 8 + j] = out[j * 8 + i]; + } + } + idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + break; + case ADST_ADST: // ADST in both directions + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(input, outptr); + input += 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + + iadst8_1d_dspr2(temp_in, temp_out); + + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } + break; + default: + printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct8_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 48(%[outptr]) \n\t" + "sw $zero, 52(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 68(%[outptr]) \n\t" + "sw $zero, 80(%[outptr]) \n\t" + "sw $zero, 84(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 100(%[outptr]) \n\t" + "sw $zero, 112(%[outptr]) \n\t" + "sw $zero, 116(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + + // Then transform columns and add to dest + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t t1, t2, vector_a1, vector_1, vector_2; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 16 \n\t" + "sra %[a1], %[out], 5 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 8; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 8; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw 
%[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 72b2126da..f9c16a5a8 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -268,43 +268,43 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 # dct # prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_1_add sse2 neon +specialize vp9_idct4x4_1_add sse2 neon dspr2 prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_16_add sse2 neon +specialize vp9_idct4x4_16_add sse2 neon dspr2 prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_1_add sse2 neon +specialize vp9_idct8x8_1_add sse2 neon dspr2 prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_64_add sse2 neon +specialize vp9_idct8x8_64_add sse2 neon dspr2 prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_10_add sse2 neon +specialize vp9_idct8x8_10_add sse2 neon dspr2 prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_1_add sse2 neon +specialize vp9_idct16x16_1_add sse2 neon dspr2 prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_256_add sse2 neon +specialize vp9_idct16x16_256_add sse2 neon dspr2 prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_10_add sse2 neon +specialize vp9_idct16x16_10_add sse2 neon dspr2 prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1024_add sse2 neon +specialize vp9_idct32x32_1024_add sse2 neon dspr2 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1_add sse2 +specialize vp9_idct32x32_1_add sse2 dspr2 prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht4x4_16_add sse2 neon +specialize vp9_iht4x4_16_add sse2 neon dspr2 prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht8x8_64_add sse2 neon +specialize vp9_iht8x8_64_add sse2 neon dspr2 prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_iht16x16_256_add sse2 +specialize vp9_iht16x16_256_add sse2 dspr2 # dct and add diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index af6e66538..11fa2e0c0 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -102,6 +102,11 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += 
common/mips/dspr2/vp9_itrans8_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c -- 2.40.0