From: Debargha Mukherjee Date: Fri, 2 Oct 2015 18:06:55 +0000 (-0700) Subject: Reimplementation of dst1 for speed X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ff9aa146cb5e4bfc771e04c701bcae7b7d92ceaa;p=libvpx Reimplementation of dst1 for speed Encoder with --enable-ext-tx --enable-dst1 is now 4 times faster. Change-Id: Ia750ad3516698ce94da4ceb566b1c51539537a95 --- diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index a1af5c9f3..bc57a0645 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -17,52 +17,333 @@ #if CONFIG_EXT_TX #if CONFIG_DST1 -static INLINE void idst_core(const tran_low_t *input, tran_low_t *output, - const int N, const int32_t *dst_lookup, - int bd) { - int i, j; - (void) bd; - for (i = 0; i < N; i++) { - int64_t sum = 0; - for (j = 0; j < N; j++) { - int idx = (i + 1) * (j + 1); - int sign = 0; - if (idx > N + 1) { - sign = (idx / (N + 1)) & 1; - idx %= (N + 1); - } - idx = MIN(idx, N + 1 - idx); - if (idx == 0) continue; - sum += (int64_t)input[j] * dst_lookup[idx - 1] * (sign ? 
-1 : 1); - } - sum = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); - output[i] = WRAPLOW(sum, bd); - } -} - void idst4(const tran_low_t *input, tran_low_t *output) { - idst_core(input, output, 4, dst_lookup4, 8); + // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2) + static const int32_t sinvalue_lookup[] = { + 141124871, 228344838, + }; + int64_t sum; + int64_t s03 = (input[0] + input[3]); + int64_t d03 = (input[0] - input[3]); + int64_t s12 = (input[1] + input[2]); + int64_t d12 = (input[1] - input[2]); + sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1]; + output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0]; + output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0]; + output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1]; + output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); } void idst8(const tran_low_t *input, tran_low_t *output) { - idst_core(input, output, 8, dst_lookup8, 8); + // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2 + static const int32_t sinvalue_lookup[] = { + 86559612, 162678858, 219176632, 249238470 + }; + int64_t sum; + int64_t s07 = (input[0] + input[7]); + int64_t d07 = (input[0] - input[7]); + int64_t s16 = (input[1] + input[6]); + int64_t d16 = (input[1] - input[6]); + int64_t s25 = (input[2] + input[5]); + int64_t d25 = (input[2] - input[5]); + int64_t s34 = (input[3] + input[4]); + int64_t d34 = (input[3] - input[4]); + sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] + + s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3]; + output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] + + d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0]; + output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * 
DCT_CONST_BITS)), 8); + sum = (s07 + s16 - s34)* sinvalue_lookup[2]; + output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] - + d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1]; + output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] - + s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1]; + output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = (d07 - d16 + d34)* sinvalue_lookup[2]; + output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] + + s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0]; + output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] + + d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3]; + output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); } void idst16(const tran_low_t *input, tran_low_t *output) { - idst_core(input, output, 16, dst_lookup16, 8); + // {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2) + static const int32_t sinvalue_lookup[] = { + 47852167, 94074787, 137093803, 175444254, + 207820161, 233119001, 250479254, 259309736 + }; + int64_t sum; + int64_t s015 = (input[0] + input[15]); + int64_t d015 = (input[0] - input[15]); + int64_t s114 = (input[1] + input[14]); + int64_t d114 = (input[1] - input[14]); + int64_t s213 = (input[2] + input[13]); + int64_t d213 = (input[2] - input[13]); + int64_t s312 = (input[3] + input[12]); + int64_t d312 = (input[3] - input[12]); + int64_t s411 = (input[4] + input[11]); + int64_t d411 = (input[4] - input[11]); + int64_t s510 = (input[5] + input[10]); + int64_t d510 = (input[5] - input[10]); + int64_t s69 = (input[6] + input[9]); + int64_t d69 = (input[6] - input[9]); + int64_t s78 = (input[7] + input[8]); + int64_t d78 = (input[7] - 
input[8]); + sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] + + s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] + + s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] + + s69 * sinvalue_lookup[6] + s78 * sinvalue_lookup[7]; + output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] + + d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] + + d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] + + d69 * sinvalue_lookup[2] + d78 * sinvalue_lookup[0]; + output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] + + s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] + + s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] - + s69 * sinvalue_lookup[3] - s78 * sinvalue_lookup[6]; + output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] + + d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] - + d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] - + d69 * sinvalue_lookup[5] - d78 * sinvalue_lookup[1]; + output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] + + s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] - + s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] + + s69 * sinvalue_lookup[0] + s78 * sinvalue_lookup[5]; + output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] - + d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] - + d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] + + d69 * sinvalue_lookup[7] + d78 * sinvalue_lookup[2]; + output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] - + s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] + + s411 * 
sinvalue_lookup[0] + s510 * sinvalue_lookup[7] + + s69 * sinvalue_lookup[1] - s78 * sinvalue_lookup[4]; + output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] - + d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] + + d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] - + d69 * sinvalue_lookup[4] - d78 * sinvalue_lookup[3]; + output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] - + s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] + + s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] - + s69 * sinvalue_lookup[4] + s78 * sinvalue_lookup[3]; + output[8] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] - + d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] + + d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] + + d69 * sinvalue_lookup[1] + d78 * sinvalue_lookup[4]; + output[9] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] - + s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] - + s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] + + s69 * sinvalue_lookup[7] - s78 * sinvalue_lookup[2]; + output[10] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] + + d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] - + d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] + + d69 * sinvalue_lookup[0] - d78 * sinvalue_lookup[5]; + output[11] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] + + s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] - + s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] - + s69 * sinvalue_lookup[5] + s78 * sinvalue_lookup[1]; + output[12] = 
WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] + + d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] + + d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] - + d69 * sinvalue_lookup[3] + d78 * sinvalue_lookup[6]; + output[13] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] + + s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] + + s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] + + s69 * sinvalue_lookup[2] - s78 * sinvalue_lookup[0]; + output[14] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); + sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] + + d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] + + d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] + + d69 * sinvalue_lookup[6] - d78 * sinvalue_lookup[7]; + output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8); } #if CONFIG_VP9_HIGHBITDEPTH void highbd_idst4(const tran_low_t *input, tran_low_t *output, int bd) { - idst_core(input, output, 4, dst_lookup4, bd); + // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2) + static const int32_t sinvalue_lookup[] = { + 141124871, 228344838, + }; + int64_t sum; + int64_t s03 = (input[0] + input[3]); + int64_t d03 = (input[0] - input[3]); + int64_t s12 = (input[1] + input[2]); + int64_t d12 = (input[1] - input[2]); + + sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1]; + output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0]; + output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0]; + output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1]; + output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); } void 
highbd_idst8(const tran_low_t *input, tran_low_t *output, int bd) { - idst_core(input, output, 8, dst_lookup8, bd); + // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2 + static const int32_t sinvalue_lookup[] = { + 86559612, 162678858, 219176632, 249238470 + }; + int64_t sum; + int64_t s07 = (input[0] + input[7]); + int64_t d07 = (input[0] - input[7]); + int64_t s16 = (input[1] + input[6]); + int64_t d16 = (input[1] - input[6]); + int64_t s25 = (input[2] + input[5]); + int64_t d25 = (input[2] - input[5]); + int64_t s34 = (input[3] + input[4]); + int64_t d34 = (input[3] - input[4]); + + sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] + + s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3]; + output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] + + d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0]; + output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = (s07 + s16 - s34)* sinvalue_lookup[2]; + output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] - + d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1]; + output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] - + s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1]; + output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = (d07 - d16 + d34)* sinvalue_lookup[2]; + output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] + + s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0]; + output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] + + d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3]; + output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); } void 
highbd_idst16(const tran_low_t *input, tran_low_t *output, int bd) { - idst_core(input, output, 16, dst_lookup16, bd); + // {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2) + static const int32_t sinvalue_lookup[] = { + 47852167, 94074787, 137093803, 175444254, + 207820161, 233119001, 250479254, 259309736 + }; + int64_t sum; + int64_t s015 = (input[0] + input[15]); + int64_t d015 = (input[0] - input[15]); + int64_t s114 = (input[1] + input[14]); + int64_t d114 = (input[1] - input[14]); + int64_t s213 = (input[2] + input[13]); + int64_t d213 = (input[2] - input[13]); + int64_t s312 = (input[3] + input[12]); + int64_t d312 = (input[3] - input[12]); + int64_t s411 = (input[4] + input[11]); + int64_t d411 = (input[4] - input[11]); + int64_t s510 = (input[5] + input[10]); + int64_t d510 = (input[5] - input[10]); + int64_t s69 = (input[6] + input[9]); + int64_t d69 = (input[6] - input[9]); + int64_t s78 = (input[7] + input[8]); + int64_t d78 = (input[7] - input[8]); + sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] + + s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] + + s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] + + s69 * sinvalue_lookup[6] + s78 * sinvalue_lookup[7]; + output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] + + d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] + + d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] + + d69 * sinvalue_lookup[2] + d78 * sinvalue_lookup[0]; + output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] + + s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] + + s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] - + s69 * sinvalue_lookup[3] - s78 * sinvalue_lookup[6]; + output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] + + d213 * 
sinvalue_lookup[4] + d312 * sinvalue_lookup[0] - + d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] - + d69 * sinvalue_lookup[5] - d78 * sinvalue_lookup[1]; + output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] + + s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] - + s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] + + s69 * sinvalue_lookup[0] + s78 * sinvalue_lookup[5]; + output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] - + d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] - + d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] + + d69 * sinvalue_lookup[7] + d78 * sinvalue_lookup[2]; + output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] - + s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] + + s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] + + s69 * sinvalue_lookup[1] - s78 * sinvalue_lookup[4]; + output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] - + d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] + + d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] - + d69 * sinvalue_lookup[4] - d78 * sinvalue_lookup[3]; + output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] - + s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] + + s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] - + s69 * sinvalue_lookup[4] + s78 * sinvalue_lookup[3]; + output[8] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] - + d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] + + d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] + + d69 * sinvalue_lookup[1] + d78 * 
sinvalue_lookup[4]; + output[9] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] - + s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] - + s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] + + s69 * sinvalue_lookup[7] - s78 * sinvalue_lookup[2]; + output[10] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] + + d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] - + d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] + + d69 * sinvalue_lookup[0] - d78 * sinvalue_lookup[5]; + output[11] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] + + s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] - + s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] - + s69 * sinvalue_lookup[5] + s78 * sinvalue_lookup[1]; + output[12] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] + + d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] + + d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] - + d69 * sinvalue_lookup[3] + d78 * sinvalue_lookup[6]; + output[13] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] + + s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] + + s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] + + s69 * sinvalue_lookup[2] - s78 * sinvalue_lookup[0]; + output[14] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); + sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] + + d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] + + d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] + + d69 * sinvalue_lookup[6] - d78 * sinvalue_lookup[7]; + output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd); } #endif // CONFIG_VP9_HIGHBITDEPTH #endif // 
CONFIG_DST1 diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index b8c125ef6..f322c8f8e 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -29,37 +29,167 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { #if CONFIG_EXT_TX #if CONFIG_DST1 -static INLINE void vp9_fdst_core(const tran_low_t *input, tran_low_t *output, - const int N, const int32_t* dst_lookup) { - int i, j; - for (i = 0; i < N; i++) { - int64_t sum = 0; - for (j = 0; j < N; j++) { - int idx = (i + 1) * (j + 1); - int sign = 0; - if (idx > N + 1) { - sign = (idx / (N + 1)) & 1; - idx %= (N + 1); - } - idx = MIN(idx, N + 1 - idx); - if (idx == 0) continue; - idx--; - sum += (int64_t)input[j] * dst_lookup[idx] * (sign ? -1 : 1); - } - output[i] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); - } -} - void vp9_fdst4(const tran_low_t *input, tran_low_t *output) { - vp9_fdst_core(input, output, 4, dst_lookup4); + // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2) + static const int32_t sinvalue_lookup[] = { + 141124871, 228344838, + }; + int64_t sum; + int64_t s03 = (input[0] + input[3]); + int64_t d03 = (input[0] - input[3]); + int64_t s12 = (input[1] + input[2]); + int64_t d12 = (input[1] - input[2]); + sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1]; + output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0]; + output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0]; + output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1]; + output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); } void vp9_fdst8(const tran_low_t *input, tran_low_t *output) { - vp9_fdst_core(input, output, 8, dst_lookup8); + // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2 + static const int sinvalue_lookup[] = { + 86559612, 162678858, 219176632, 249238470 + }; + int64_t sum; + int64_t 
s07 = (input[0] + input[7]); + int64_t d07 = (input[0] - input[7]); + int64_t s16 = (input[1] + input[6]); + int64_t d16 = (input[1] - input[6]); + int64_t s25 = (input[2] + input[5]); + int64_t d25 = (input[2] - input[5]); + int64_t s34 = (input[3] + input[4]); + int64_t d34 = (input[3] - input[4]); + sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] + + s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3]; + output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] + + d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0]; + output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = (s07 + s16 - s34)* sinvalue_lookup[2]; + output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] - + d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1]; + output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] - + s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1]; + output[4] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = (d07 - d16 + d34)* sinvalue_lookup[2]; + output[5] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] + + s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0]; + output[6] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] + + d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3]; + output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); } void vp9_fdst16(const tran_low_t *input, tran_low_t *output) { - vp9_fdst_core(input, output, 16, dst_lookup16); + // {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2) + static const int sinvalue_lookup[] = { + 47852167, 94074787, 137093803, 175444254, + 207820161, 233119001, 250479254, 259309736 + }; + int64_t sum; + int64_t s015 = (input[0] + input[15]); + int64_t d015 = (input[0] - 
input[15]); + int64_t s114 = (input[1] + input[14]); + int64_t d114 = (input[1] - input[14]); + int64_t s213 = (input[2] + input[13]); + int64_t d213 = (input[2] - input[13]); + int64_t s312 = (input[3] + input[12]); + int64_t d312 = (input[3] - input[12]); + int64_t s411 = (input[4] + input[11]); + int64_t d411 = (input[4] - input[11]); + int64_t s510 = (input[5] + input[10]); + int64_t d510 = (input[5] - input[10]); + int64_t s69 = (input[6] + input[9]); + int64_t d69 = (input[6] - input[9]); + int64_t s78 = (input[7] + input[8]); + int64_t d78 = (input[7] - input[8]); + sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] + + s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] + + s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] + + s69 * sinvalue_lookup[6] + s78 * sinvalue_lookup[7]; + output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] + + d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] + + d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] + + d69 * sinvalue_lookup[2] + d78 * sinvalue_lookup[0]; + output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] + + s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] + + s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] - + s69 * sinvalue_lookup[3] - s78 * sinvalue_lookup[6]; + output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] + + d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] - + d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] - + d69 * sinvalue_lookup[5] - d78 * sinvalue_lookup[1]; + output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] + + s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] - + s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] + + s69 * sinvalue_lookup[0] + s78 * sinvalue_lookup[5]; + 
output[4] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] - + d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] - + d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] + + d69 * sinvalue_lookup[7] + d78 * sinvalue_lookup[2]; + output[5] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] - + s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] + + s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] + + s69 * sinvalue_lookup[1] - s78 * sinvalue_lookup[4]; + output[6] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] - + d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] + + d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] - + d69 * sinvalue_lookup[4] - d78 * sinvalue_lookup[3]; + output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] - + s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] + + s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] - + s69 * sinvalue_lookup[4] + s78 * sinvalue_lookup[3]; + output[8] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] - + d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] + + d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] + + d69 * sinvalue_lookup[1] + d78 * sinvalue_lookup[4]; + output[9] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] - + s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] - + s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] + + s69 * sinvalue_lookup[7] - s78 * sinvalue_lookup[2]; + output[10] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] + + d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] - + d411 * sinvalue_lookup[7] + d510 * 
sinvalue_lookup[3] + + d69 * sinvalue_lookup[0] - d78 * sinvalue_lookup[5]; + output[11] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] + + s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] - + s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] - + s69 * sinvalue_lookup[5] + s78 * sinvalue_lookup[1]; + output[12] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] + + d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] + + d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] - + d69 * sinvalue_lookup[3] + d78 * sinvalue_lookup[6]; + output[13] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] + + s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] + + s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] + + s69 * sinvalue_lookup[2] - s78 * sinvalue_lookup[0]; + output[14] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); + sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] + + d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] + + d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] + + d69 * sinvalue_lookup[6] - d78 * sinvalue_lookup[7]; + output[15] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)); } #endif // CONFIG_DST1 #endif // CONFIG_EXT_TX