granicus.if.org Git - libvpx/blob - vpx_dsp/inv_txfm.c

   1 /*
   2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <math.h>
  12 #include <stdlib.h>
  13 #include <string.h>
  14
  15 #include "./vpx_dsp_rtcd.h"
  16 #include "vpx_dsp/inv_txfm.h"
  17
  18 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  19   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
  20      0.5 shifts per pixel. */
  21   int i;
  22   tran_low_t output[16];
  23   tran_high_t a1, b1, c1, d1, e1;
  24   const tran_low_t *ip = input;
  25   tran_low_t *op = output;
  26
  27   for (i = 0; i < 4; i++) {
  28     a1 = ip[0] >> UNIT_QUANT_SHIFT;
  29     c1 = ip[1] >> UNIT_QUANT_SHIFT;
  30     d1 = ip[2] >> UNIT_QUANT_SHIFT;
  31     b1 = ip[3] >> UNIT_QUANT_SHIFT;
  32     a1 += c1;
  33     d1 -= b1;
  34     e1 = (a1 - d1) >> 1;
  35     b1 = e1 - b1;
  36     c1 = e1 - c1;
  37     a1 -= b1;
  38     d1 += c1;
  39     op[0] = WRAPLOW(a1);
  40     op[1] = WRAPLOW(b1);
  41     op[2] = WRAPLOW(c1);
  42     op[3] = WRAPLOW(d1);
  43     ip += 4;
  44     op += 4;
  45   }
  46
  47   ip = output;
  48   for (i = 0; i < 4; i++) {
  49     a1 = ip[4 * 0];
  50     c1 = ip[4 * 1];
  51     d1 = ip[4 * 2];
  52     b1 = ip[4 * 3];
  53     a1 += c1;
  54     d1 -= b1;
  55     e1 = (a1 - d1) >> 1;
  56     b1 = e1 - b1;
  57     c1 = e1 - c1;
  58     a1 -= b1;
  59     d1 += c1;
  60     dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
  61     dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
  62     dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
  63     dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
  64
  65     ip++;
  66     dest++;
  67   }
  68 }
  69
  70 void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
  71   int i;
  72   tran_high_t a1, e1;
  73   tran_low_t tmp[4];
  74   const tran_low_t *ip = in;
  75   tran_low_t *op = tmp;
  76
  77   a1 = ip[0] >> UNIT_QUANT_SHIFT;
  78   e1 = a1 >> 1;
  79   a1 -= e1;
  80   op[0] = WRAPLOW(a1);
  81   op[1] = op[2] = op[3] = WRAPLOW(e1);
  82
  83   ip = tmp;
  84   for (i = 0; i < 4; i++) {
  85     e1 = ip[0] >> 1;
  86     a1 = ip[0] - e1;
  87     dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
  88     dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
  89     dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
  90     dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
  91     ip++;
  92     dest++;
  93   }
  94 }
  95
  96 void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  97   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  98   tran_low_t x0 = input[0];
  99   tran_low_t x1 = input[1];
 100   tran_low_t x2 = input[2];
 101   tran_low_t x3 = input[3];
 102
 103   if (!(x0 | x1 | x2 | x3)) {
 104     memset(output, 0, 4 * sizeof(*output));
 105     return;
 106   }
 107
 108   s0 = sinpi_1_9 * x0;
 109   s1 = sinpi_2_9 * x0;
 110   s2 = sinpi_3_9 * x1;
 111   s3 = sinpi_4_9 * x2;
 112   s4 = sinpi_1_9 * x2;
 113   s5 = sinpi_2_9 * x3;
 114   s6 = sinpi_4_9 * x3;
 115   s7 = WRAPLOW(x0 - x2 + x3);
 116
 117   s0 = s0 + s3 + s5;
 118   s1 = s1 - s4 - s6;
 119   s3 = s2;
 120   s2 = sinpi_3_9 * s7;
 121
 122   // 1-D transform scaling factor is sqrt(2).
 123   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
 124   // + 1b (addition) = 29b.
 125   // Hence the output bit depth is 15b.
 126   output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
 127   output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
 128   output[2] = WRAPLOW(dct_const_round_shift(s2));
 129   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 130 }
 131
 132 void idct4_c(const tran_low_t *input, tran_low_t *output) {
 133   tran_low_t step[4];
 134   tran_high_t temp1, temp2;
 135
 136   // stage 1
 137   temp1 = (input[0] + input[2]) * cospi_16_64;
 138   temp2 = (input[0] - input[2]) * cospi_16_64;
 139   step[0] = WRAPLOW(dct_const_round_shift(temp1));
 140   step[1] = WRAPLOW(dct_const_round_shift(temp2));
 141   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
 142   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
 143   step[2] = WRAPLOW(dct_const_round_shift(temp1));
 144   step[3] = WRAPLOW(dct_const_round_shift(temp2));
 145
 146   // stage 2
 147   output[0] = WRAPLOW(step[0] + step[3]);
 148   output[1] = WRAPLOW(step[1] + step[2]);
 149   output[2] = WRAPLOW(step[1] - step[2]);
 150   output[3] = WRAPLOW(step[0] - step[3]);
 151 }
 152
 153 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 154   int i, j;
 155   tran_low_t out[4 * 4];
 156   tran_low_t *outptr = out;
 157   tran_low_t temp_in[4], temp_out[4];
 158
 159   // Rows
 160   for (i = 0; i < 4; ++i) {
 161     idct4_c(input, outptr);
 162     input += 4;
 163     outptr += 4;
 164   }
 165
 166   // Columns
 167   for (i = 0; i < 4; ++i) {
 168     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
 169     idct4_c(temp_in, temp_out);
 170     for (j = 0; j < 4; ++j) {
 171       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
 172                                             ROUND_POWER_OF_TWO(temp_out[j], 4));
 173     }
 174   }
 175 }
 176
 177 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 178   int i;
 179   tran_high_t a1;
 180   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
 181
 182   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
 183   a1 = ROUND_POWER_OF_TWO(out, 4);
 184
 185   for (i = 0; i < 4; i++) {
 186     dest[0] = clip_pixel_add(dest[0], a1);
 187     dest[1] = clip_pixel_add(dest[1], a1);
 188     dest[2] = clip_pixel_add(dest[2], a1);
 189     dest[3] = clip_pixel_add(dest[3], a1);
 190     dest += stride;
 191   }
 192 }
 193
 194 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
 195   int s0, s1, s2, s3, s4, s5, s6, s7;
 196   tran_high_t x0 = input[7];
 197   tran_high_t x1 = input[0];
 198   tran_high_t x2 = input[5];
 199   tran_high_t x3 = input[2];
 200   tran_high_t x4 = input[3];
 201   tran_high_t x5 = input[4];
 202   tran_high_t x6 = input[1];
 203   tran_high_t x7 = input[6];
 204
 205   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
 206     memset(output, 0, 8 * sizeof(*output));
 207     return;
 208   }
 209
 210   // stage 1
 211   s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
 212   s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
 213   s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
 214   s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
 215   s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
 216   s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
 217   s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
 218   s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
 219
 220   x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
 221   x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
 222   x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
 223   x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
 224   x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
 225   x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
 226   x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
 227   x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
 228
 229   // stage 2
 230   s0 = (int)x0;
 231   s1 = (int)x1;
 232   s2 = (int)x2;
 233   s3 = (int)x3;
 234   s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
 235   s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
 236   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
 237   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
 238
 239   x0 = WRAPLOW(s0 + s2);
 240   x1 = WRAPLOW(s1 + s3);
 241   x2 = WRAPLOW(s0 - s2);
 242   x3 = WRAPLOW(s1 - s3);
 243   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
 244   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
 245   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
 246   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
 247
 248   // stage 3
 249   s2 = (int)(cospi_16_64 * (x2 + x3));
 250   s3 = (int)(cospi_16_64 * (x2 - x3));
 251   s6 = (int)(cospi_16_64 * (x6 + x7));
 252   s7 = (int)(cospi_16_64 * (x6 - x7));
 253
 254   x2 = WRAPLOW(dct_const_round_shift(s2));
 255   x3 = WRAPLOW(dct_const_round_shift(s3));
 256   x6 = WRAPLOW(dct_const_round_shift(s6));
 257   x7 = WRAPLOW(dct_const_round_shift(s7));
 258
 259   output[0] = WRAPLOW(x0);
 260   output[1] = WRAPLOW(-x4);
 261   output[2] = WRAPLOW(x6);
 262   output[3] = WRAPLOW(-x2);
 263   output[4] = WRAPLOW(x3);
 264   output[5] = WRAPLOW(-x7);
 265   output[6] = WRAPLOW(x5);
 266   output[7] = WRAPLOW(-x1);
 267 }
 268
 269 void idct8_c(const tran_low_t *input, tran_low_t *output) {
 270   tran_low_t step1[8], step2[8];
 271   tran_high_t temp1, temp2;
 272
 273   // stage 1
 274   step1[0] = input[0];
 275   step1[2] = input[4];
 276   step1[1] = input[2];
 277   step1[3] = input[6];
 278   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
 279   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
 280   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
 281   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
 282   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
 283   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
 284   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
 285   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 286
 287   // stage 2
 288   temp1 = (step1[0] + step1[2]) * cospi_16_64;
 289   temp2 = (step1[0] - step1[2]) * cospi_16_64;
 290   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
 291   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
 292   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
 293   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
 294   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
 295   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
 296   step2[4] = WRAPLOW(step1[4] + step1[5]);
 297   step2[5] = WRAPLOW(step1[4] - step1[5]);
 298   step2[6] = WRAPLOW(-step1[6] + step1[7]);
 299   step2[7] = WRAPLOW(step1[6] + step1[7]);
 300
 301   // stage 3
 302   step1[0] = WRAPLOW(step2[0] + step2[3]);
 303   step1[1] = WRAPLOW(step2[1] + step2[2]);
 304   step1[2] = WRAPLOW(step2[1] - step2[2]);
 305   step1[3] = WRAPLOW(step2[0] - step2[3]);
 306   step1[4] = step2[4];
 307   temp1 = (step2[6] - step2[5]) * cospi_16_64;
 308   temp2 = (step2[5] + step2[6]) * cospi_16_64;
 309   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
 310   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 311   step1[7] = step2[7];
 312
 313   // stage 4
 314   output[0] = WRAPLOW(step1[0] + step1[7]);
 315   output[1] = WRAPLOW(step1[1] + step1[6]);
 316   output[2] = WRAPLOW(step1[2] + step1[5]);
 317   output[3] = WRAPLOW(step1[3] + step1[4]);
 318   output[4] = WRAPLOW(step1[3] - step1[4]);
 319   output[5] = WRAPLOW(step1[2] - step1[5]);
 320   output[6] = WRAPLOW(step1[1] - step1[6]);
 321   output[7] = WRAPLOW(step1[0] - step1[7]);
 322 }
 323
 324 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 325   int i, j;
 326   tran_low_t out[8 * 8];
 327   tran_low_t *outptr = out;
 328   tran_low_t temp_in[8], temp_out[8];
 329
 330   // First transform rows
 331   for (i = 0; i < 8; ++i) {
 332     idct8_c(input, outptr);
 333     input += 8;
 334     outptr += 8;
 335   }
 336
 337   // Then transform columns
 338   for (i = 0; i < 8; ++i) {
 339     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
 340     idct8_c(temp_in, temp_out);
 341     for (j = 0; j < 8; ++j) {
 342       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
 343                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
 344     }
 345   }
 346 }
 347
 348 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 349   int i, j;
 350   tran_low_t out[8 * 8] = { 0 };
 351   tran_low_t *outptr = out;
 352   tran_low_t temp_in[8], temp_out[8];
 353
 354   // First transform rows
 355   // Only first 4 row has non-zero coefs
 356   for (i = 0; i < 4; ++i) {
 357     idct8_c(input, outptr);
 358     input += 8;
 359     outptr += 8;
 360   }
 361
 362   // Then transform columns
 363   for (i = 0; i < 8; ++i) {
 364     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
 365     idct8_c(temp_in, temp_out);
 366     for (j = 0; j < 8; ++j) {
 367       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
 368                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
 369     }
 370   }
 371 }
 372
 373 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 374   int i, j;
 375   tran_high_t a1;
 376   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
 377
 378   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
 379   a1 = ROUND_POWER_OF_TWO(out, 5);
 380   for (j = 0; j < 8; ++j) {
 381     for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
 382     dest += stride;
 383   }
 384 }
 385
 386 void iadst16_c(const tran_low_t *input, tran_low_t *output) {
 387   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
 388   tran_high_t s9, s10, s11, s12, s13, s14, s15;
 389   tran_high_t x0 = input[15];
 390   tran_high_t x1 = input[0];
 391   tran_high_t x2 = input[13];
 392   tran_high_t x3 = input[2];
 393   tran_high_t x4 = input[11];
 394   tran_high_t x5 = input[4];
 395   tran_high_t x6 = input[9];
 396   tran_high_t x7 = input[6];
 397   tran_high_t x8 = input[7];
 398   tran_high_t x9 = input[8];
 399   tran_high_t x10 = input[5];
 400   tran_high_t x11 = input[10];
 401   tran_high_t x12 = input[3];
 402   tran_high_t x13 = input[12];
 403   tran_high_t x14 = input[1];
 404   tran_high_t x15 = input[14];
 405
 406   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
 407         x13 | x14 | x15)) {
 408     memset(output, 0, 16 * sizeof(*output));
 409     return;
 410   }
 411
 412   // stage 1
 413   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
 414   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
 415   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
 416   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
 417   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
 418   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
 419   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
 420   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
 421   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
 422   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
 423   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
 424   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
 425   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
 426   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
 427   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
 428   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
 429
 430   x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
 431   x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
 432   x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
 433   x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
 434   x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
 435   x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
 436   x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
 437   x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
 438   x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
 439   x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
 440   x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
 441   x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
 442   x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
 443   x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
 444   x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
 445   x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
 446
 447   // stage 2
 448   s0 = x0;
 449   s1 = x1;
 450   s2 = x2;
 451   s3 = x3;
 452   s4 = x4;
 453   s5 = x5;
 454   s6 = x6;
 455   s7 = x7;
 456   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
 457   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
 458   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
 459   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
 460   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
 461   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
 462   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
 463   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
 464
 465   x0 = WRAPLOW(s0 + s4);
 466   x1 = WRAPLOW(s1 + s5);
 467   x2 = WRAPLOW(s2 + s6);
 468   x3 = WRAPLOW(s3 + s7);
 469   x4 = WRAPLOW(s0 - s4);
 470   x5 = WRAPLOW(s1 - s5);
 471   x6 = WRAPLOW(s2 - s6);
 472   x7 = WRAPLOW(s3 - s7);
 473   x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
 474   x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
 475   x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
 476   x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
 477   x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
 478   x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
 479   x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
 480   x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
 481
 482   // stage 3
 483   s0 = x0;
 484   s1 = x1;
 485   s2 = x2;
 486   s3 = x3;
 487   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
 488   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
 489   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
 490   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
 491   s8 = x8;
 492   s9 = x9;
 493   s10 = x10;
 494   s11 = x11;
 495   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
 496   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
 497   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
 498   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 499
 500   x0 = WRAPLOW(s0 + s2);
 501   x1 = WRAPLOW(s1 + s3);
 502   x2 = WRAPLOW(s0 - s2);
 503   x3 = WRAPLOW(s1 - s3);
 504   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
 505   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
 506   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
 507   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
 508   x8 = WRAPLOW(s8 + s10);
 509   x9 = WRAPLOW(s9 + s11);
 510   x10 = WRAPLOW(s8 - s10);
 511   x11 = WRAPLOW(s9 - s11);
 512   x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
 513   x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
 514   x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
 515   x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
 516
 517   // stage 4
 518   s2 = (-cospi_16_64) * (x2 + x3);
 519   s3 = cospi_16_64 * (x2 - x3);
 520   s6 = cospi_16_64 * (x6 + x7);
 521   s7 = cospi_16_64 * (-x6 + x7);
 522   s10 = cospi_16_64 * (x10 + x11);
 523   s11 = cospi_16_64 * (-x10 + x11);
 524   s14 = (-cospi_16_64) * (x14 + x15);
 525   s15 = cospi_16_64 * (x14 - x15);
 526
 527   x2 = WRAPLOW(dct_const_round_shift(s2));
 528   x3 = WRAPLOW(dct_const_round_shift(s3));
 529   x6 = WRAPLOW(dct_const_round_shift(s6));
 530   x7 = WRAPLOW(dct_const_round_shift(s7));
 531   x10 = WRAPLOW(dct_const_round_shift(s10));
 532   x11 = WRAPLOW(dct_const_round_shift(s11));
 533   x14 = WRAPLOW(dct_const_round_shift(s14));
 534   x15 = WRAPLOW(dct_const_round_shift(s15));
 535
 536   output[0] = WRAPLOW(x0);
 537   output[1] = WRAPLOW(-x8);
 538   output[2] = WRAPLOW(x12);
 539   output[3] = WRAPLOW(-x4);
 540   output[4] = WRAPLOW(x6);
 541   output[5] = WRAPLOW(x14);
 542   output[6] = WRAPLOW(x10);
 543   output[7] = WRAPLOW(x2);
 544   output[8] = WRAPLOW(x3);
 545   output[9] = WRAPLOW(x11);
 546   output[10] = WRAPLOW(x15);
 547   output[11] = WRAPLOW(x7);
 548   output[12] = WRAPLOW(x5);
 549   output[13] = WRAPLOW(-x13);
 550   output[14] = WRAPLOW(x9);
 551   output[15] = WRAPLOW(-x1);
 552 }
 553
 554 void idct16_c(const tran_low_t *input, tran_low_t *output) {
 555   tran_low_t step1[16], step2[16];
 556   tran_high_t temp1, temp2;
 557
 558   // stage 1
 559   step1[0] = input[0 / 2];
 560   step1[1] = input[16 / 2];
 561   step1[2] = input[8 / 2];
 562   step1[3] = input[24 / 2];
 563   step1[4] = input[4 / 2];
 564   step1[5] = input[20 / 2];
 565   step1[6] = input[12 / 2];
 566   step1[7] = input[28 / 2];
 567   step1[8] = input[2 / 2];
 568   step1[9] = input[18 / 2];
 569   step1[10] = input[10 / 2];
 570   step1[11] = input[26 / 2];
 571   step1[12] = input[6 / 2];
 572   step1[13] = input[22 / 2];
 573   step1[14] = input[14 / 2];
 574   step1[15] = input[30 / 2];
 575
 576   // stage 2
 577   step2[0] = step1[0];
 578   step2[1] = step1[1];
 579   step2[2] = step1[2];
 580   step2[3] = step1[3];
 581   step2[4] = step1[4];
 582   step2[5] = step1[5];
 583   step2[6] = step1[6];
 584   step2[7] = step1[7];
 585
 586   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
 587   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
 588   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
 589   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 590
 591   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
 592   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
 593   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
 594   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 595
 596   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
 597   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
 598   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
 599   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 600
 601   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
 602   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
 603   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
 604   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 605
 606   // stage 3
 607   step1[0] = step2[0];
 608   step1[1] = step2[1];
 609   step1[2] = step2[2];
 610   step1[3] = step2[3];
 611
 612   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
 613   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
 614   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
 615   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
 616   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
 617   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
 618   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
 619   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 620
 621   step1[8] = WRAPLOW(step2[8] + step2[9]);
 622   step1[9] = WRAPLOW(step2[8] - step2[9]);
 623   step1[10] = WRAPLOW(-step2[10] + step2[11]);
 624   step1[11] = WRAPLOW(step2[10] + step2[11]);
 625   step1[12] = WRAPLOW(step2[12] + step2[13]);
 626   step1[13] = WRAPLOW(step2[12] - step2[13]);
 627   step1[14] = WRAPLOW(-step2[14] + step2[15]);
 628   step1[15] = WRAPLOW(step2[14] + step2[15]);
 629
 630   // stage 4
 631   temp1 = (step1[0] + step1[1]) * cospi_16_64;
 632   temp2 = (step1[0] - step1[1]) * cospi_16_64;
 633   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
 634   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
 635   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
 636   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
 637   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
 638   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
 639   step2[4] = WRAPLOW(step1[4] + step1[5]);
 640   step2[5] = WRAPLOW(step1[4] - step1[5]);
 641   step2[6] = WRAPLOW(-step1[6] + step1[7]);
 642   step2[7] = WRAPLOW(step1[6] + step1[7]);
 643
 644   step2[8] = step1[8];
 645   step2[15] = step1[15];
 646   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
 647   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
 648   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
 649   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 650   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
 651   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
 652   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
 653   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 654   step2[11] = step1[11];
 655   step2[12] = step1[12];
 656
 657   // stage 5
 658   step1[0] = WRAPLOW(step2[0] + step2[3]);
 659   step1[1] = WRAPLOW(step2[1] + step2[2]);
 660   step1[2] = WRAPLOW(step2[1] - step2[2]);
 661   step1[3] = WRAPLOW(step2[0] - step2[3]);
 662   step1[4] = step2[4];
 663   temp1 = (step2[6] - step2[5]) * cospi_16_64;
 664   temp2 = (step2[5] + step2[6]) * cospi_16_64;
 665   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
 666   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 667   step1[7] = step2[7];
 668
 669   step1[8] = WRAPLOW(step2[8] + step2[11]);
 670   step1[9] = WRAPLOW(step2[9] + step2[10]);
 671   step1[10] = WRAPLOW(step2[9] - step2[10]);
 672   step1[11] = WRAPLOW(step2[8] - step2[11]);
 673   step1[12] = WRAPLOW(-step2[12] + step2[15]);
 674   step1[13] = WRAPLOW(-step2[13] + step2[14]);
 675   step1[14] = WRAPLOW(step2[13] + step2[14]);
 676   step1[15] = WRAPLOW(step2[12] + step2[15]);
 677
 678   // stage 6
 679   step2[0] = WRAPLOW(step1[0] + step1[7]);
 680   step2[1] = WRAPLOW(step1[1] + step1[6]);
 681   step2[2] = WRAPLOW(step1[2] + step1[5]);
 682   step2[3] = WRAPLOW(step1[3] + step1[4]);
 683   step2[4] = WRAPLOW(step1[3] - step1[4]);
 684   step2[5] = WRAPLOW(step1[2] - step1[5]);
 685   step2[6] = WRAPLOW(step1[1] - step1[6]);
 686   step2[7] = WRAPLOW(step1[0] - step1[7]);
 687   step2[8] = step1[8];
 688   step2[9] = step1[9];
 689   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
 690   temp2 = (step1[10] + step1[13]) * cospi_16_64;
 691   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
 692   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 693   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
 694   temp2 = (step1[11] + step1[12]) * cospi_16_64;
 695   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
 696   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 697   step2[14] = step1[14];
 698   step2[15] = step1[15];
 699
 700   // stage 7
 701   output[0] = WRAPLOW(step2[0] + step2[15]);
 702   output[1] = WRAPLOW(step2[1] + step2[14]);
 703   output[2] = WRAPLOW(step2[2] + step2[13]);
 704   output[3] = WRAPLOW(step2[3] + step2[12]);
 705   output[4] = WRAPLOW(step2[4] + step2[11]);
 706   output[5] = WRAPLOW(step2[5] + step2[10]);
 707   output[6] = WRAPLOW(step2[6] + step2[9]);
 708   output[7] = WRAPLOW(step2[7] + step2[8]);
 709   output[8] = WRAPLOW(step2[7] - step2[8]);
 710   output[9] = WRAPLOW(step2[6] - step2[9]);
 711   output[10] = WRAPLOW(step2[5] - step2[10]);
 712   output[11] = WRAPLOW(step2[4] - step2[11]);
 713   output[12] = WRAPLOW(step2[3] - step2[12]);
 714   output[13] = WRAPLOW(step2[2] - step2[13]);
 715   output[14] = WRAPLOW(step2[1] - step2[14]);
 716   output[15] = WRAPLOW(step2[0] - step2[15]);
 717 }
 718
 719 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
 720                              int stride) {
 721   int i, j;
 722   tran_low_t out[16 * 16];
 723   tran_low_t *outptr = out;
 724   tran_low_t temp_in[16], temp_out[16];
 725
 726   // First transform rows
 727   for (i = 0; i < 16; ++i) {
 728     idct16_c(input, outptr);
 729     input += 16;
 730     outptr += 16;
 731   }
 732
 733   // Then transform columns
 734   for (i = 0; i < 16; ++i) {
 735     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
 736     idct16_c(temp_in, temp_out);
 737     for (j = 0; j < 16; ++j) {
 738       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
 739                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
 740     }
 741   }
 742 }
 743
 744 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
 745                             int stride) {
 746   int i, j;
 747   tran_low_t out[16 * 16] = { 0 };
 748   tran_low_t *outptr = out;
 749   tran_low_t temp_in[16], temp_out[16];
 750
 751   // First transform rows. Since all non-zero dct coefficients are in
 752   // upper-left 8x8 area, we only need to calculate first 8 rows here.
 753   for (i = 0; i < 8; ++i) {
 754     idct16_c(input, outptr);
 755     input += 16;
 756     outptr += 16;
 757   }
 758
 759   // Then transform columns
 760   for (i = 0; i < 16; ++i) {
 761     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
 762     idct16_c(temp_in, temp_out);
 763     for (j = 0; j < 16; ++j) {
 764       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
 765                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
 766     }
 767   }
 768 }
 769
 770 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
 771                             int stride) {
 772   int i, j;
 773   tran_low_t out[16 * 16] = { 0 };
 774   tran_low_t *outptr = out;
 775   tran_low_t temp_in[16], temp_out[16];
 776
 777   // First transform rows. Since all non-zero dct coefficients are in
 778   // upper-left 4x4 area, we only need to calculate first 4 rows here.
 779   for (i = 0; i < 4; ++i) {
 780     idct16_c(input, outptr);
 781     input += 16;
 782     outptr += 16;
 783   }
 784
 785   // Then transform columns
 786   for (i = 0; i < 16; ++i) {
 787     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
 788     idct16_c(temp_in, temp_out);
 789     for (j = 0; j < 16; ++j) {
 790       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
 791                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
 792     }
 793   }
 794 }
 795
 796 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 797   int i, j;
 798   tran_high_t a1;
 799   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
 800
 801   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
 802   a1 = ROUND_POWER_OF_TWO(out, 6);
 803   for (j = 0; j < 16; ++j) {
 804     for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
 805     dest += stride;
 806   }
 807 }
 808
 809 void idct32_c(const tran_low_t *input, tran_low_t *output) {
 810   tran_low_t step1[32], step2[32];
 811   tran_high_t temp1, temp2;
 812
 813   // stage 1
 814   step1[0] = input[0];
 815   step1[1] = input[16];
 816   step1[2] = input[8];
 817   step1[3] = input[24];
 818   step1[4] = input[4];
 819   step1[5] = input[20];
 820   step1[6] = input[12];
 821   step1[7] = input[28];
 822   step1[8] = input[2];
 823   step1[9] = input[18];
 824   step1[10] = input[10];
 825   step1[11] = input[26];
 826   step1[12] = input[6];
 827   step1[13] = input[22];
 828   step1[14] = input[14];
 829   step1[15] = input[30];
 830
 831   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
 832   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
 833   step1[16] = WRAPLOW(dct_const_round_shift(temp1));
 834   step1[31] = WRAPLOW(dct_const_round_shift(temp2));
 835
 836   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
 837   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
 838   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
 839   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 840
 841   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
 842   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
 843   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
 844   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 845
 846   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
 847   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
 848   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
 849   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
 850
 851   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
 852   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
 853   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
 854   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
 855
 856   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
 857   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
 858   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
 859   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 860
 861   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
 862   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
 863   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
 864   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 865
 866   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
 867   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
 868   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
 869   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
 870
 871   // stage 2
 872   step2[0] = step1[0];
 873   step2[1] = step1[1];
 874   step2[2] = step1[2];
 875   step2[3] = step1[3];
 876   step2[4] = step1[4];
 877   step2[5] = step1[5];
 878   step2[6] = step1[6];
 879   step2[7] = step1[7];
 880
 881   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
 882   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
 883   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
 884   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 885
 886   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
 887   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
 888   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
 889   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 890
 891   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
 892   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
 893   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
 894   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 895
 896   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
 897   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
 898   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
 899   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 900
 901   step2[16] = WRAPLOW(step1[16] + step1[17]);
 902   step2[17] = WRAPLOW(step1[16] - step1[17]);
 903   step2[18] = WRAPLOW(-step1[18] + step1[19]);
 904   step2[19] = WRAPLOW(step1[18] + step1[19]);
 905   step2[20] = WRAPLOW(step1[20] + step1[21]);
 906   step2[21] = WRAPLOW(step1[20] - step1[21]);
 907   step2[22] = WRAPLOW(-step1[22] + step1[23]);
 908   step2[23] = WRAPLOW(step1[22] + step1[23]);
 909   step2[24] = WRAPLOW(step1[24] + step1[25]);
 910   step2[25] = WRAPLOW(step1[24] - step1[25]);
 911   step2[26] = WRAPLOW(-step1[26] + step1[27]);
 912   step2[27] = WRAPLOW(step1[26] + step1[27]);
 913   step2[28] = WRAPLOW(step1[28] + step1[29]);
 914   step2[29] = WRAPLOW(step1[28] - step1[29]);
 915   step2[30] = WRAPLOW(-step1[30] + step1[31]);
 916   step2[31] = WRAPLOW(step1[30] + step1[31]);
 917
 918   // stage 3
 919   step1[0] = step2[0];
 920   step1[1] = step2[1];
 921   step1[2] = step2[2];
 922   step1[3] = step2[3];
 923
 924   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
 925   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
 926   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
 927   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
 928   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
 929   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
 930   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
 931   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 932
 933   step1[8] = WRAPLOW(step2[8] + step2[9]);
 934   step1[9] = WRAPLOW(step2[8] - step2[9]);
 935   step1[10] = WRAPLOW(-step2[10] + step2[11]);
 936   step1[11] = WRAPLOW(step2[10] + step2[11]);
 937   step1[12] = WRAPLOW(step2[12] + step2[13]);
 938   step1[13] = WRAPLOW(step2[12] - step2[13]);
 939   step1[14] = WRAPLOW(-step2[14] + step2[15]);
 940   step1[15] = WRAPLOW(step2[14] + step2[15]);
 941
 942   step1[16] = step2[16];
 943   step1[31] = step2[31];
 944   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
 945   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
 946   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
 947   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 948   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
 949   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
 950   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
 951   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 952   step1[19] = step2[19];
 953   step1[20] = step2[20];
 954   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
 955   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
 956   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
 957   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 958   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
 959   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
 960   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
 961   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 962   step1[23] = step2[23];
 963   step1[24] = step2[24];
 964   step1[27] = step2[27];
 965   step1[28] = step2[28];
 966
 967   // stage 4
 968   temp1 = (step1[0] + step1[1]) * cospi_16_64;
 969   temp2 = (step1[0] - step1[1]) * cospi_16_64;
 970   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
 971   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
 972   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
 973   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
 974   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
 975   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
 976   step2[4] = WRAPLOW(step1[4] + step1[5]);
 977   step2[5] = WRAPLOW(step1[4] - step1[5]);
 978   step2[6] = WRAPLOW(-step1[6] + step1[7]);
 979   step2[7] = WRAPLOW(step1[6] + step1[7]);
 980
 981   step2[8] = step1[8];
 982   step2[15] = step1[15];
 983   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
 984   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
 985   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
 986   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 987   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
 988   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
 989   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
 990   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 991   step2[11] = step1[11];
 992   step2[12] = step1[12];
 993
 994   step2[16] = WRAPLOW(step1[16] + step1[19]);
 995   step2[17] = WRAPLOW(step1[17] + step1[18]);
 996   step2[18] = WRAPLOW(step1[17] - step1[18]);
 997   step2[19] = WRAPLOW(step1[16] - step1[19]);
 998   step2[20] = WRAPLOW(-step1[20] + step1[23]);
 999   step2[21] = WRAPLOW(-step1[21] + step1[22]);
1000   step2[22] = WRAPLOW(step1[21] + step1[22]);
1001   step2[23] = WRAPLOW(step1[20] + step1[23]);
1002
1003   step2[24] = WRAPLOW(step1[24] + step1[27]);
1004   step2[25] = WRAPLOW(step1[25] + step1[26]);
1005   step2[26] = WRAPLOW(step1[25] - step1[26]);
1006   step2[27] = WRAPLOW(step1[24] - step1[27]);
1007   step2[28] = WRAPLOW(-step1[28] + step1[31]);
1008   step2[29] = WRAPLOW(-step1[29] + step1[30]);
1009   step2[30] = WRAPLOW(step1[29] + step1[30]);
1010   step2[31] = WRAPLOW(step1[28] + step1[31]);
1011
1012   // stage 5
1013   step1[0] = WRAPLOW(step2[0] + step2[3]);
1014   step1[1] = WRAPLOW(step2[1] + step2[2]);
1015   step1[2] = WRAPLOW(step2[1] - step2[2]);
1016   step1[3] = WRAPLOW(step2[0] - step2[3]);
1017   step1[4] = step2[4];
1018   temp1 = (step2[6] - step2[5]) * cospi_16_64;
1019   temp2 = (step2[5] + step2[6]) * cospi_16_64;
1020   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
1021   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
1022   step1[7] = step2[7];
1023
1024   step1[8] = WRAPLOW(step2[8] + step2[11]);
1025   step1[9] = WRAPLOW(step2[9] + step2[10]);
1026   step1[10] = WRAPLOW(step2[9] - step2[10]);
1027   step1[11] = WRAPLOW(step2[8] - step2[11]);
1028   step1[12] = WRAPLOW(-step2[12] + step2[15]);
1029   step1[13] = WRAPLOW(-step2[13] + step2[14]);
1030   step1[14] = WRAPLOW(step2[13] + step2[14]);
1031   step1[15] = WRAPLOW(step2[12] + step2[15]);
1032
1033   step1[16] = step2[16];
1034   step1[17] = step2[17];
1035   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1036   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1037   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1038   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1039   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1040   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1041   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1042   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1043   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1044   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1045   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1046   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1047   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1048   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1049   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1050   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1051   step1[22] = step2[22];
1052   step1[23] = step2[23];
1053   step1[24] = step2[24];
1054   step1[25] = step2[25];
1055   step1[30] = step2[30];
1056   step1[31] = step2[31];
1057
1058   // stage 6
1059   step2[0] = WRAPLOW(step1[0] + step1[7]);
1060   step2[1] = WRAPLOW(step1[1] + step1[6]);
1061   step2[2] = WRAPLOW(step1[2] + step1[5]);
1062   step2[3] = WRAPLOW(step1[3] + step1[4]);
1063   step2[4] = WRAPLOW(step1[3] - step1[4]);
1064   step2[5] = WRAPLOW(step1[2] - step1[5]);
1065   step2[6] = WRAPLOW(step1[1] - step1[6]);
1066   step2[7] = WRAPLOW(step1[0] - step1[7]);
1067   step2[8] = step1[8];
1068   step2[9] = step1[9];
1069   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1070   temp2 = (step1[10] + step1[13]) * cospi_16_64;
1071   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1072   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1073   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1074   temp2 = (step1[11] + step1[12]) * cospi_16_64;
1075   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1076   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1077   step2[14] = step1[14];
1078   step2[15] = step1[15];
1079
1080   step2[16] = WRAPLOW(step1[16] + step1[23]);
1081   step2[17] = WRAPLOW(step1[17] + step1[22]);
1082   step2[18] = WRAPLOW(step1[18] + step1[21]);
1083   step2[19] = WRAPLOW(step1[19] + step1[20]);
1084   step2[20] = WRAPLOW(step1[19] - step1[20]);
1085   step2[21] = WRAPLOW(step1[18] - step1[21]);
1086   step2[22] = WRAPLOW(step1[17] - step1[22]);
1087   step2[23] = WRAPLOW(step1[16] - step1[23]);
1088
1089   step2[24] = WRAPLOW(-step1[24] + step1[31]);
1090   step2[25] = WRAPLOW(-step1[25] + step1[30]);
1091   step2[26] = WRAPLOW(-step1[26] + step1[29]);
1092   step2[27] = WRAPLOW(-step1[27] + step1[28]);
1093   step2[28] = WRAPLOW(step1[27] + step1[28]);
1094   step2[29] = WRAPLOW(step1[26] + step1[29]);
1095   step2[30] = WRAPLOW(step1[25] + step1[30]);
1096   step2[31] = WRAPLOW(step1[24] + step1[31]);
1097
1098   // stage 7
1099   step1[0] = WRAPLOW(step2[0] + step2[15]);
1100   step1[1] = WRAPLOW(step2[1] + step2[14]);
1101   step1[2] = WRAPLOW(step2[2] + step2[13]);
1102   step1[3] = WRAPLOW(step2[3] + step2[12]);
1103   step1[4] = WRAPLOW(step2[4] + step2[11]);
1104   step1[5] = WRAPLOW(step2[5] + step2[10]);
1105   step1[6] = WRAPLOW(step2[6] + step2[9]);
1106   step1[7] = WRAPLOW(step2[7] + step2[8]);
1107   step1[8] = WRAPLOW(step2[7] - step2[8]);
1108   step1[9] = WRAPLOW(step2[6] - step2[9]);
1109   step1[10] = WRAPLOW(step2[5] - step2[10]);
1110   step1[11] = WRAPLOW(step2[4] - step2[11]);
1111   step1[12] = WRAPLOW(step2[3] - step2[12]);
1112   step1[13] = WRAPLOW(step2[2] - step2[13]);
1113   step1[14] = WRAPLOW(step2[1] - step2[14]);
1114   step1[15] = WRAPLOW(step2[0] - step2[15]);
1115
1116   step1[16] = step2[16];
1117   step1[17] = step2[17];
1118   step1[18] = step2[18];
1119   step1[19] = step2[19];
1120   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1121   temp2 = (step2[20] + step2[27]) * cospi_16_64;
1122   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1123   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1124   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1125   temp2 = (step2[21] + step2[26]) * cospi_16_64;
1126   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1127   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1128   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1129   temp2 = (step2[22] + step2[25]) * cospi_16_64;
1130   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1131   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1132   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1133   temp2 = (step2[23] + step2[24]) * cospi_16_64;
1134   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1135   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1136   step1[28] = step2[28];
1137   step1[29] = step2[29];
1138   step1[30] = step2[30];
1139   step1[31] = step2[31];
1140
1141   // final stage
1142   output[0] = WRAPLOW(step1[0] + step1[31]);
1143   output[1] = WRAPLOW(step1[1] + step1[30]);
1144   output[2] = WRAPLOW(step1[2] + step1[29]);
1145   output[3] = WRAPLOW(step1[3] + step1[28]);
1146   output[4] = WRAPLOW(step1[4] + step1[27]);
1147   output[5] = WRAPLOW(step1[5] + step1[26]);
1148   output[6] = WRAPLOW(step1[6] + step1[25]);
1149   output[7] = WRAPLOW(step1[7] + step1[24]);
1150   output[8] = WRAPLOW(step1[8] + step1[23]);
1151   output[9] = WRAPLOW(step1[9] + step1[22]);
1152   output[10] = WRAPLOW(step1[10] + step1[21]);
1153   output[11] = WRAPLOW(step1[11] + step1[20]);
1154   output[12] = WRAPLOW(step1[12] + step1[19]);
1155   output[13] = WRAPLOW(step1[13] + step1[18]);
1156   output[14] = WRAPLOW(step1[14] + step1[17]);
1157   output[15] = WRAPLOW(step1[15] + step1[16]);
1158   output[16] = WRAPLOW(step1[15] - step1[16]);
1159   output[17] = WRAPLOW(step1[14] - step1[17]);
1160   output[18] = WRAPLOW(step1[13] - step1[18]);
1161   output[19] = WRAPLOW(step1[12] - step1[19]);
1162   output[20] = WRAPLOW(step1[11] - step1[20]);
1163   output[21] = WRAPLOW(step1[10] - step1[21]);
1164   output[22] = WRAPLOW(step1[9] - step1[22]);
1165   output[23] = WRAPLOW(step1[8] - step1[23]);
1166   output[24] = WRAPLOW(step1[7] - step1[24]);
1167   output[25] = WRAPLOW(step1[6] - step1[25]);
1168   output[26] = WRAPLOW(step1[5] - step1[26]);
1169   output[27] = WRAPLOW(step1[4] - step1[27]);
1170   output[28] = WRAPLOW(step1[3] - step1[28]);
1171   output[29] = WRAPLOW(step1[2] - step1[29]);
1172   output[30] = WRAPLOW(step1[1] - step1[30]);
1173   output[31] = WRAPLOW(step1[0] - step1[31]);
1174 }
1175
1176 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1177                               int stride) {
1178   int i, j;
1179   tran_low_t out[32 * 32];
1180   tran_low_t *outptr = out;
1181   tran_low_t temp_in[32], temp_out[32];
1182
1183   // Rows
1184   for (i = 0; i < 32; ++i) {
1185     int16_t zero_coeff[16];
1186     for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1187     for (j = 0; j < 8; ++j)
1188       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1189     for (j = 0; j < 4; ++j)
1190       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1191     for (j = 0; j < 2; ++j)
1192       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1193
1194     if (zero_coeff[0] | zero_coeff[1])
1195       idct32_c(input, outptr);
1196     else
1197       memset(outptr, 0, sizeof(tran_low_t) * 32);
1198     input += 32;
1199     outptr += 32;
1200   }
1201
1202   // Columns
1203   for (i = 0; i < 32; ++i) {
1204     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1205     idct32_c(temp_in, temp_out);
1206     for (j = 0; j < 32; ++j) {
1207       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1208                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
1209     }
1210   }
1211 }
1212
1213 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
1214                              int stride) {
1215   int i, j;
1216   tran_low_t out[32 * 32] = { 0 };
1217   tran_low_t *outptr = out;
1218   tran_low_t temp_in[32], temp_out[32];
1219
1220   // Rows
1221   // Only upper-left 16x16 has non-zero coeff
1222   for (i = 0; i < 16; ++i) {
1223     idct32_c(input, outptr);
1224     input += 32;
1225     outptr += 32;
1226   }
1227
1228   // Columns
1229   for (i = 0; i < 32; ++i) {
1230     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1231     idct32_c(temp_in, temp_out);
1232     for (j = 0; j < 32; ++j) {
1233       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1234                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
1235     }
1236   }
1237 }
1238
1239 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1240                             int stride) {
1241   int i, j;
1242   tran_low_t out[32 * 32] = { 0 };
1243   tran_low_t *outptr = out;
1244   tran_low_t temp_in[32], temp_out[32];
1245
1246   // Rows
1247   // Only upper-left 8x8 has non-zero coeff
1248   for (i = 0; i < 8; ++i) {
1249     idct32_c(input, outptr);
1250     input += 32;
1251     outptr += 32;
1252   }
1253
1254   // Columns
1255   for (i = 0; i < 32; ++i) {
1256     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1257     idct32_c(temp_in, temp_out);
1258     for (j = 0; j < 32; ++j) {
1259       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1260                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
1261     }
1262   }
1263 }
1264
1265 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1266   int i, j;
1267   tran_high_t a1;
1268   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
1269
1270   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1271   a1 = ROUND_POWER_OF_TWO(out, 6);
1272
1273   for (j = 0; j < 32; ++j) {
1274     for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
1275     dest += stride;
1276   }
1277 }
1278
1279 #if CONFIG_VP9_HIGHBITDEPTH
1280
1281 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1282 // transform amplify bits + 1 bit for contingency in rounding and quantizing
1283 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1284
1285 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1286                                               int size) {
1287   int i;
1288   for (i = 0; i < size; ++i)
1289     if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1290   return 0;
1291 }
1292
1293 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1294                                  int stride, int bd) {
1295   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1296      0.5 shifts per pixel. */
1297   int i;
1298   tran_low_t output[16];
1299   tran_high_t a1, b1, c1, d1, e1;
1300   const tran_low_t *ip = input;
1301   tran_low_t *op = output;
1302   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1303
1304   for (i = 0; i < 4; i++) {
1305     a1 = ip[0] >> UNIT_QUANT_SHIFT;
1306     c1 = ip[1] >> UNIT_QUANT_SHIFT;
1307     d1 = ip[2] >> UNIT_QUANT_SHIFT;
1308     b1 = ip[3] >> UNIT_QUANT_SHIFT;
1309     a1 += c1;
1310     d1 -= b1;
1311     e1 = (a1 - d1) >> 1;
1312     b1 = e1 - b1;
1313     c1 = e1 - c1;
1314     a1 -= b1;
1315     d1 += c1;
1316     op[0] = HIGHBD_WRAPLOW(a1, bd);
1317     op[1] = HIGHBD_WRAPLOW(b1, bd);
1318     op[2] = HIGHBD_WRAPLOW(c1, bd);
1319     op[3] = HIGHBD_WRAPLOW(d1, bd);
1320     ip += 4;
1321     op += 4;
1322   }
1323
1324   ip = output;
1325   for (i = 0; i < 4; i++) {
1326     a1 = ip[4 * 0];
1327     c1 = ip[4 * 1];
1328     d1 = ip[4 * 2];
1329     b1 = ip[4 * 3];
1330     a1 += c1;
1331     d1 -= b1;
1332     e1 = (a1 - d1) >> 1;
1333     b1 = e1 - b1;
1334     c1 = e1 - c1;
1335     a1 -= b1;
1336     d1 += c1;
1337     dest[stride * 0] =
1338         highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
1339     dest[stride * 1] =
1340         highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
1341     dest[stride * 2] =
1342         highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
1343     dest[stride * 3] =
1344         highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
1345
1346     ip++;
1347     dest++;
1348   }
1349 }
1350
1351 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1352                                 int stride, int bd) {
1353   int i;
1354   tran_high_t a1, e1;
1355   tran_low_t tmp[4];
1356   const tran_low_t *ip = in;
1357   tran_low_t *op = tmp;
1358   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1359   (void)bd;
1360
1361   a1 = ip[0] >> UNIT_QUANT_SHIFT;
1362   e1 = a1 >> 1;
1363   a1 -= e1;
1364   op[0] = HIGHBD_WRAPLOW(a1, bd);
1365   op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1366
1367   ip = tmp;
1368   for (i = 0; i < 4; i++) {
1369     e1 = ip[0] >> 1;
1370     a1 = ip[0] - e1;
1371     dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1372     dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
1373     dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
1374     dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
1375     ip++;
1376     dest++;
1377   }
1378 }
1379
1380 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1381   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1382   tran_low_t x0 = input[0];
1383   tran_low_t x1 = input[1];
1384   tran_low_t x2 = input[2];
1385   tran_low_t x3 = input[3];
1386   (void)bd;
1387
1388   if (detect_invalid_highbd_input(input, 4)) {
1389 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1390     assert(0 && "invalid highbd txfm input");
1391 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1392     memset(output, 0, sizeof(*output) * 4);
1393     return;
1394   }
1395
1396   if (!(x0 | x1 | x2 | x3)) {
1397     memset(output, 0, 4 * sizeof(*output));
1398     return;
1399   }
1400
1401   s0 = sinpi_1_9 * x0;
1402   s1 = sinpi_2_9 * x0;
1403   s2 = sinpi_3_9 * x1;
1404   s3 = sinpi_4_9 * x2;
1405   s4 = sinpi_1_9 * x2;
1406   s5 = sinpi_2_9 * x3;
1407   s6 = sinpi_4_9 * x3;
1408   s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
1409
1410   s0 = s0 + s3 + s5;
1411   s1 = s1 - s4 - s6;
1412   s3 = s2;
1413   s2 = sinpi_3_9 * s7;
1414
1415   // 1-D transform scaling factor is sqrt(2).
1416   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1417   // + 1b (addition) = 29b.
1418   // Hence the output bit depth is 15b.
1419   output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
1420   output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
1421   output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1422   output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
1423 }
1424
1425 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1426   tran_low_t step[4];
1427   tran_high_t temp1, temp2;
1428   (void)bd;
1429
1430   if (detect_invalid_highbd_input(input, 4)) {
1431 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1432     assert(0 && "invalid highbd txfm input");
1433 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1434     memset(output, 0, sizeof(*output) * 4);
1435     return;
1436   }
1437
1438   // stage 1
1439   temp1 = (input[0] + input[2]) * cospi_16_64;
1440   temp2 = (input[0] - input[2]) * cospi_16_64;
1441   step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1442   step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1443   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1444   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1445   step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1446   step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1447
1448   // stage 2
1449   output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
1450   output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
1451   output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
1452   output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
1453 }
1454
1455 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1456                                  int stride, int bd) {
1457   int i, j;
1458   tran_low_t out[4 * 4];
1459   tran_low_t *outptr = out;
1460   tran_low_t temp_in[4], temp_out[4];
1461   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1462
1463   // Rows
1464   for (i = 0; i < 4; ++i) {
1465     vpx_highbd_idct4_c(input, outptr, bd);
1466     input += 4;
1467     outptr += 4;
1468   }
1469
1470   // Columns
1471   for (i = 0; i < 4; ++i) {
1472     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
1473     vpx_highbd_idct4_c(temp_in, temp_out, bd);
1474     for (j = 0; j < 4; ++j) {
1475       dest[j * stride + i] = highbd_clip_pixel_add(
1476           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1477     }
1478   }
1479 }
1480
1481 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1482                                 int stride, int bd) {
1483   int i;
1484   tran_high_t a1;
1485   tran_low_t out =
1486       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1487   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1488
1489   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1490   a1 = ROUND_POWER_OF_TWO(out, 4);
1491
1492   for (i = 0; i < 4; i++) {
1493     dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1494     dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1495     dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1496     dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1497     dest += stride;
1498   }
1499 }
1500
1501 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1502   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1503   tran_low_t x0 = input[7];
1504   tran_low_t x1 = input[0];
1505   tran_low_t x2 = input[5];
1506   tran_low_t x3 = input[2];
1507   tran_low_t x4 = input[3];
1508   tran_low_t x5 = input[4];
1509   tran_low_t x6 = input[1];
1510   tran_low_t x7 = input[6];
1511   (void)bd;
1512
1513   if (detect_invalid_highbd_input(input, 8)) {
1514 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1515     assert(0 && "invalid highbd txfm input");
1516 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1517     memset(output, 0, sizeof(*output) * 8);
1518     return;
1519   }
1520
1521   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1522     memset(output, 0, 8 * sizeof(*output));
1523     return;
1524   }
1525
1526   // stage 1
1527   s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
1528   s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
1529   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
1530   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
1531   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
1532   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
1533   s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
1534   s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
1535
1536   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1537   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1538   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1539   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1540   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1541   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1542   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1543   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
1544
1545   // stage 2
1546   s0 = x0;
1547   s1 = x1;
1548   s2 = x2;
1549   s3 = x3;
1550   s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
1551   s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
1552   s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
1553   s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
1554
1555   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1556   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1557   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1558   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1559   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1560   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1561   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1562   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1563
1564   // stage 3
1565   s2 = cospi_16_64 * (x2 + x3);
1566   s3 = cospi_16_64 * (x2 - x3);
1567   s6 = cospi_16_64 * (x6 + x7);
1568   s7 = cospi_16_64 * (x6 - x7);
1569
1570   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1571   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1572   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1573   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1574
1575   output[0] = HIGHBD_WRAPLOW(x0, bd);
1576   output[1] = HIGHBD_WRAPLOW(-x4, bd);
1577   output[2] = HIGHBD_WRAPLOW(x6, bd);
1578   output[3] = HIGHBD_WRAPLOW(-x2, bd);
1579   output[4] = HIGHBD_WRAPLOW(x3, bd);
1580   output[5] = HIGHBD_WRAPLOW(-x7, bd);
1581   output[6] = HIGHBD_WRAPLOW(x5, bd);
1582   output[7] = HIGHBD_WRAPLOW(-x1, bd);
1583 }
1584
1585 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1586   tran_low_t step1[8], step2[8];
1587   tran_high_t temp1, temp2;
1588
1589   if (detect_invalid_highbd_input(input, 8)) {
1590 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1591     assert(0 && "invalid highbd txfm input");
1592 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1593     memset(output, 0, sizeof(*output) * 8);
1594     return;
1595   }
1596
1597   // stage 1
1598   step1[0] = input[0];
1599   step1[2] = input[4];
1600   step1[1] = input[2];
1601   step1[3] = input[6];
1602   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1603   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
1604   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1605   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1606   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1607   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
1608   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1609   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1610
1611   // stage 2 & stage 3 - even half
1612   vpx_highbd_idct4_c(step1, step1, bd);
1613
1614   // stage 2 - odd half
1615   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1616   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1617   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1618   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1619
1620   // stage 3 - odd half
1621   step1[4] = step2[4];
1622   temp1 = (step2[6] - step2[5]) * cospi_16_64;
1623   temp2 = (step2[5] + step2[6]) * cospi_16_64;
1624   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1625   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1626   step1[7] = step2[7];
1627
1628   // stage 4
1629   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1630   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1631   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1632   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1633   output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1634   output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1635   output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1636   output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1637 }
1638
1639 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1640                                  int stride, int bd) {
1641   int i, j;
1642   tran_low_t out[8 * 8];
1643   tran_low_t *outptr = out;
1644   tran_low_t temp_in[8], temp_out[8];
1645   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1646
1647   // First transform rows
1648   for (i = 0; i < 8; ++i) {
1649     vpx_highbd_idct8_c(input, outptr, bd);
1650     input += 8;
1651     outptr += 8;
1652   }
1653
1654   // Then transform columns
1655   for (i = 0; i < 8; ++i) {
1656     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1657     vpx_highbd_idct8_c(temp_in, temp_out, bd);
1658     for (j = 0; j < 8; ++j) {
1659       dest[j * stride + i] = highbd_clip_pixel_add(
1660           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1661     }
1662   }
1663 }
1664
1665 void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
1666                                  int stride, int bd) {
1667   int i, j;
1668   tran_low_t out[8 * 8] = { 0 };
1669   tran_low_t *outptr = out;
1670   tran_low_t temp_in[8], temp_out[8];
1671   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1672
1673   // First transform rows
1674   // Only first 4 row has non-zero coefs
1675   for (i = 0; i < 4; ++i) {
1676     vpx_highbd_idct8_c(input, outptr, bd);
1677     input += 8;
1678     outptr += 8;
1679   }
1680
1681   // Then transform columns
1682   for (i = 0; i < 8; ++i) {
1683     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1684     vpx_highbd_idct8_c(temp_in, temp_out, bd);
1685     for (j = 0; j < 8; ++j) {
1686       dest[j * stride + i] = highbd_clip_pixel_add(
1687           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1688     }
1689   }
1690 }
1691
1692 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1693                                 int stride, int bd) {
1694   int i, j;
1695   tran_high_t a1;
1696   tran_low_t out =
1697       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1698   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1699
1700   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1701   a1 = ROUND_POWER_OF_TWO(out, 5);
1702   for (j = 0; j < 8; ++j) {
1703     for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1704     dest += stride;
1705   }
1706 }
1707
// 16-point 1-D inverse ADST on high-bitdepth coefficients (the transform kind
// is taken from the function name).
//   input  - 16 coefficients; validated with detect_invalid_highbd_input.
//   output - receives 16 results; zero-filled on rejected or all-zero input.
//   bd     - bit depth, forwarded to every HIGHBD_WRAPLOW invocation.
// The network runs four stages on x0..x15 (tran_low_t), using s0..s15
// (tran_high_t) for the intermediate products and sums.
1708 void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1709   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
1710   tran_high_t s9, s10, s11, s12, s13, s14, s15;
       // Load the inputs in permuted order: even-numbered x take
       // input[15], input[13], ..., input[1]; odd-numbered x take
       // input[0], input[2], ..., input[14].
1711   tran_low_t x0 = input[15];
1712   tran_low_t x1 = input[0];
1713   tran_low_t x2 = input[13];
1714   tran_low_t x3 = input[2];
1715   tran_low_t x4 = input[11];
1716   tran_low_t x5 = input[4];
1717   tran_low_t x6 = input[9];
1718   tran_low_t x7 = input[6];
1719   tran_low_t x8 = input[7];
1720   tran_low_t x9 = input[8];
1721   tran_low_t x10 = input[5];
1722   tran_low_t x11 = input[10];
1723   tran_low_t x12 = input[3];
1724   tran_low_t x13 = input[12];
1725   tran_low_t x14 = input[1];
1726   tran_low_t x15 = input[14];
       // NOTE(review): bd is also passed to HIGHBD_WRAPLOW below; this cast
       // presumably silences an unused-parameter warning in builds where the
       // macro ignores its second argument - confirm against its definition.
1727   (void)bd;
1728
       // Input rejected by the validity check: assert when coefficient range
       // checking is compiled in, then produce an all-zero output.
1729   if (detect_invalid_highbd_input(input, 16)) {
1730 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1731     assert(0 && "invalid highbd txfm input");
1732 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1733     memset(output, 0, sizeof(*output) * 16);
1734     return;
1735   }
1736
       // Shortcut: if all 16 coefficients are zero, write zeros and skip the
       // arithmetic.
1737   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1738         x13 | x14 | x15)) {
1739     memset(output, 0, 16 * sizeof(*output));
1740     return;
1741   }
1742
1743   // stage 1
       // Rotate each pair (x0,x1), (x2,x3), ..., (x14,x15) with a pair of
       // odd-indexed cospi constants.
1744   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1745   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1746   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1747   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1748   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1749   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1750   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1751   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1752   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1753   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1754   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1755   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1756   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1757   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1758   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1759   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1760
       // Butterfly s[k] +/- s[k + 8]; every term carries a cospi factor, so
       // each result goes through dct_const_round_shift before wrapping.
1761   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
1762   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
1763   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
1764   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
1765   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
1766   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
1767   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
1768   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
1769   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
1770   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
1771   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
1772   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
1773   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
1774   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
1775   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
1776   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
1777
1778   // stage 2
       // x0..x7 pass through unscaled; x8..x15 are rotated in pairs with the
       // cospi_4/cospi_28 and cospi_20/cospi_12 constants.
1779   s0 = x0;
1780   s1 = x1;
1781   s2 = x2;
1782   s3 = x3;
1783   s4 = x4;
1784   s5 = x5;
1785   s6 = x6;
1786   s7 = x7;
1787   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1788   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1789   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1790   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1791   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
1792   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1793   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
1794   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1795
       // s0..s7 were not multiplied by a constant in this stage, so their
       // butterflies (distance 4) skip dct_const_round_shift; s8..s15 need it.
1796   x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
1797   x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
1798   x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
1799   x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
1800   x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
1801   x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
1802   x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
1803   x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
1804   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
1805   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
1806   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
1807   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
1808   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
1809   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
1810   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
1811   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
1812
1813   // stage 3
       // x0..x3 and x8..x11 pass through; x4..x7 and x12..x15 are rotated in
       // pairs with cospi_8_64 / cospi_24_64.
1814   s0 = x0;
1815   s1 = x1;
1816   s2 = x2;
1817   s3 = x3;
1818   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1819   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1820   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
1821   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1822   s8 = x8;
1823   s9 = x9;
1824   s10 = x10;
1825   s11 = x11;
1826   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1827   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1828   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
1829   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1830
       // Butterflies at distance 2; only the rotated groups are round-shifted.
1831   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1832   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1833   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1834   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1835   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1836   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1837   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1838   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1839   x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
1840   x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
1841   x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
1842   x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
1843   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
1844   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
1845   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
1846   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
1847
1848   // stage 4
       // Scale the sum and difference of the pairs (x2,x3), (x6,x7),
       // (x10,x11), (x14,x15) by +/-cospi_16_64. x0, x1, x4, x5, x8, x9, x12
       // and x13 are left untouched.
1849   s2 = (-cospi_16_64) * (x2 + x3);
1850   s3 = cospi_16_64 * (x2 - x3);
1851   s6 = cospi_16_64 * (x6 + x7);
1852   s7 = cospi_16_64 * (-x6 + x7);
1853   s10 = cospi_16_64 * (x10 + x11);
1854   s11 = cospi_16_64 * (-x10 + x11);
1855   s14 = (-cospi_16_64) * (x14 + x15);
1856   s15 = cospi_16_64 * (x14 - x15);
1857
1858   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1859   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1860   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1861   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1862   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
1863   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
1864   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
1865   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
1866
       // Final permutation into output order; x8, x4, x13 and x1 are negated.
1867   output[0] = HIGHBD_WRAPLOW(x0, bd);
1868   output[1] = HIGHBD_WRAPLOW(-x8, bd);
1869   output[2] = HIGHBD_WRAPLOW(x12, bd);
1870   output[3] = HIGHBD_WRAPLOW(-x4, bd);
1871   output[4] = HIGHBD_WRAPLOW(x6, bd);
1872   output[5] = HIGHBD_WRAPLOW(x14, bd);
1873   output[6] = HIGHBD_WRAPLOW(x10, bd);
1874   output[7] = HIGHBD_WRAPLOW(x2, bd);
1875   output[8] = HIGHBD_WRAPLOW(x3, bd);
1876   output[9] = HIGHBD_WRAPLOW(x11, bd);
1877   output[10] = HIGHBD_WRAPLOW(x15, bd);
1878   output[11] = HIGHBD_WRAPLOW(x7, bd);
1879   output[12] = HIGHBD_WRAPLOW(x5, bd);
1880   output[13] = HIGHBD_WRAPLOW(-x13, bd);
1881   output[14] = HIGHBD_WRAPLOW(x9, bd);
1882   output[15] = HIGHBD_WRAPLOW(-x1, bd);
1883 }
1884
// 16-point 1-D inverse DCT on high-bitdepth coefficients (the transform kind
// is taken from the function name).
//   input  - 16 coefficients; validated with detect_invalid_highbd_input.
//   output - receives 16 results; zero-filled when the input is rejected.
//   bd     - bit depth, forwarded to every HIGHBD_WRAPLOW invocation.
// The network alternates between the step1[] and step2[] scratch arrays over
// seven stages; products and sums are formed in temp1/temp2 (tran_high_t),
// passed through dct_const_round_shift and narrowed with HIGHBD_WRAPLOW.
1885 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1886   tran_low_t step1[16], step2[16];
1887   tran_high_t temp1, temp2;
       // NOTE(review): bd is also passed to HIGHBD_WRAPLOW below; this cast
       // presumably silences an unused-parameter warning in builds where the
       // macro ignores its second argument - confirm against its definition.
1888   (void)bd;
1889
       // Input rejected by the validity check: assert when coefficient range
       // checking is compiled in, then produce an all-zero output.
1890   if (detect_invalid_highbd_input(input, 16)) {
1891 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1892     assert(0 && "invalid highbd txfm input");
1893 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1894     memset(output, 0, sizeof(*output) * 16);
1895     return;
1896   }
1897
1898   // stage 1
       // Reorder the input. Each index is written as k / 2, where k is the
       // index highbd_idct32_c uses for the same step1[] slot in its stage 1.
1899   step1[0] = input[0 / 2];
1900   step1[1] = input[16 / 2];
1901   step1[2] = input[8 / 2];
1902   step1[3] = input[24 / 2];
1903   step1[4] = input[4 / 2];
1904   step1[5] = input[20 / 2];
1905   step1[6] = input[12 / 2];
1906   step1[7] = input[28 / 2];
1907   step1[8] = input[2 / 2];
1908   step1[9] = input[18 / 2];
1909   step1[10] = input[10 / 2];
1910   step1[11] = input[26 / 2];
1911   step1[12] = input[6 / 2];
1912   step1[13] = input[22 / 2];
1913   step1[14] = input[14 / 2];
1914   step1[15] = input[30 / 2];
1915
1916   // stage 2
       // Elements 0..7 are copied unchanged.
1917   step2[0] = step1[0];
1918   step2[1] = step1[1];
1919   step2[2] = step1[2];
1920   step2[3] = step1[3];
1921   step2[4] = step1[4];
1922   step2[5] = step1[5];
1923   step2[6] = step1[6];
1924   step2[7] = step1[7];
1925
       // Elements 8..15 are rotated in the mirrored pairs (8,15), (9,14),
       // (10,13) and (11,12).
1926   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1927   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1928   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1929   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1930
1931   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1932   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1933   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1934   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1935
1936   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1937   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1938   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1939   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1940
1941   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1942   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1943   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1944   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1945
1946   // stage 3
       // Elements 0..3 are copied; pairs (4,7) and (5,6) are rotated.
1947   step1[0] = step2[0];
1948   step1[1] = step2[1];
1949   step1[2] = step2[2];
1950   step1[3] = step2[3];
1951
1952   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1953   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1954   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1955   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1956   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1957   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1958   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1959   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1960
       // Add/subtract butterflies on adjacent pairs of elements 8..15.
1961   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
1962   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
1963   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
1964   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
1965   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
1966   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
1967   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
1968   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
1969
1970   // stage 4
       // Pair (0,1): sum and difference scaled by cospi_16_64. Pair (2,3):
       // rotation with cospi_24_64 / cospi_8_64. Elements 4..7: add/subtract
       // butterflies on adjacent pairs.
1971   temp1 = (step1[0] + step1[1]) * cospi_16_64;
1972   temp2 = (step1[0] - step1[1]) * cospi_16_64;
1973   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1974   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1975   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1976   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1977   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1978   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1979   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1980   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1981   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1982   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1983
       // Elements 8, 11, 12 and 15 are copied; pairs (9,14) and (10,13) are
       // rotated with cospi_8_64 / cospi_24_64.
1984   step2[8] = step1[8];
1985   step2[15] = step1[15];
1986   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1987   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1988   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1989   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1990   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1991   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1992   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1993   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1994   step2[11] = step1[11];
1995   step2[12] = step1[12];
1996
1997   // stage 5
       // Elements 0..3: butterflies on the mirrored pairs (0,3) and (1,2).
       // Elements 4 and 7 are copied; pair (5,6) is scaled by cospi_16_64.
1998   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
1999   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2000   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2001   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2002   step1[4] = step2[4];
2003   temp1 = (step2[6] - step2[5]) * cospi_16_64;
2004   temp2 = (step2[5] + step2[6]) * cospi_16_64;
2005   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2006   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2007   step1[7] = step2[7];
2008
       // Elements 8..15: butterflies on the mirrored pairs (8,11), (9,10),
       // (12,15) and (13,14).
2009   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2010   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2011   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2012   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2013   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2014   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2015   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2016   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2017
2018   // stage 6
       // Elements 0..7: butterflies on the mirrored pairs (k, 7 - k).
2019   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2020   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2021   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2022   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2023   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2024   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2025   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2026   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
       // Elements 8, 9, 14 and 15 are copied; pairs (10,13) and (11,12) are
       // scaled by cospi_16_64.
2027   step2[8] = step1[8];
2028   step2[9] = step1[9];
2029   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2030   temp2 = (step1[10] + step1[13]) * cospi_16_64;
2031   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2032   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2033   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2034   temp2 = (step1[11] + step1[12]) * cospi_16_64;
2035   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2036   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2037   step2[14] = step1[14];
2038   step2[15] = step1[15];
2039
2040   // stage 7
       // Final butterflies: for k in 0..7, output[k] = step2[k] + step2[15 - k]
       // and output[15 - k] = step2[k] - step2[15 - k].
2041   output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2042   output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2043   output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2044   output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2045   output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2046   output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2047   output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2048   output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2049   output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2050   output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2051   output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2052   output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2053   output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2054   output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2055   output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2056   output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2057 }
2058
2059 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2060                                     int stride, int bd) {
2061   int i, j;
2062   tran_low_t out[16 * 16];
2063   tran_low_t *outptr = out;
2064   tran_low_t temp_in[16], temp_out[16];
2065   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2066
2067   // First transform rows
2068   for (i = 0; i < 16; ++i) {
2069     vpx_highbd_idct16_c(input, outptr, bd);
2070     input += 16;
2071     outptr += 16;
2072   }
2073
2074   // Then transform columns
2075   for (i = 0; i < 16; ++i) {
2076     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2077     vpx_highbd_idct16_c(temp_in, temp_out, bd);
2078     for (j = 0; j < 16; ++j) {
2079       dest[j * stride + i] = highbd_clip_pixel_add(
2080           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2081     }
2082   }
2083 }
2084
2085 void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
2086                                    int stride, int bd) {
2087   int i, j;
2088   tran_low_t out[16 * 16] = { 0 };
2089   tran_low_t *outptr = out;
2090   tran_low_t temp_in[16], temp_out[16];
2091   uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
2092
2093   // First transform rows. Since all non-zero dct coefficients are in
2094   // upper-left 8x8 area, we only need to calculate first 8 rows here.
2095   for (i = 0; i < 8; ++i) {
2096     vpx_highbd_idct16_c(input, outptr, bd);
2097     input += 16;
2098     outptr += 16;
2099   }
2100
2101   // Then transform columns
2102   for (i = 0; i < 16; ++i) {
2103     uint16_t *destT = dest;
2104     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2105     vpx_highbd_idct16_c(temp_in, temp_out, bd);
2106     for (j = 0; j < 16; ++j) {
2107       destT[i] = highbd_clip_pixel_add(destT[i],
2108                                        ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2109       destT += stride;
2110     }
2111   }
2112 }
2113
2114 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
2115                                    int stride, int bd) {
2116   int i, j;
2117   tran_low_t out[16 * 16] = { 0 };
2118   tran_low_t *outptr = out;
2119   tran_low_t temp_in[16], temp_out[16];
2120   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2121
2122   // First transform rows. Since all non-zero dct coefficients are in
2123   // upper-left 4x4 area, we only need to calculate first 4 rows here.
2124   for (i = 0; i < 4; ++i) {
2125     vpx_highbd_idct16_c(input, outptr, bd);
2126     input += 16;
2127     outptr += 16;
2128   }
2129
2130   // Then transform columns
2131   for (i = 0; i < 16; ++i) {
2132     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2133     vpx_highbd_idct16_c(temp_in, temp_out, bd);
2134     for (j = 0; j < 16; ++j) {
2135       dest[j * stride + i] = highbd_clip_pixel_add(
2136           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2137     }
2138   }
2139 }
2140
2141 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2142                                   int stride, int bd) {
2143   int i, j;
2144   tran_high_t a1;
2145   tran_low_t out =
2146       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2147   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2148
2149   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2150   a1 = ROUND_POWER_OF_TWO(out, 6);
2151   for (j = 0; j < 16; ++j) {
2152     for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2153     dest += stride;
2154   }
2155 }
2156
2157 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2158                             int bd) {
2159   tran_low_t step1[32], step2[32];
2160   tran_high_t temp1, temp2;
2161   (void)bd;
2162
2163   if (detect_invalid_highbd_input(input, 32)) {
2164 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2165     assert(0 && "invalid highbd txfm input");
2166 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
2167     memset(output, 0, sizeof(*output) * 32);
2168     return;
2169   }
2170
2171   // stage 1
2172   step1[0] = input[0];
2173   step1[1] = input[16];
2174   step1[2] = input[8];
2175   step1[3] = input[24];
2176   step1[4] = input[4];
2177   step1[5] = input[20];
2178   step1[6] = input[12];
2179   step1[7] = input[28];
2180   step1[8] = input[2];
2181   step1[9] = input[18];
2182   step1[10] = input[10];
2183   step1[11] = input[26];
2184   step1[12] = input[6];
2185   step1[13] = input[22];
2186   step1[14] = input[14];
2187   step1[15] = input[30];
2188
2189   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2190   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2191   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2192   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2193
2194   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2195   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2196   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2197   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2198
2199   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2200   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2201   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2202   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2203
2204   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2205   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2206   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2207   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2208
2209   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2210   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2211   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2212   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2213
2214   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2215   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2216   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2217   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2218
2219   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2220   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2221   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2222   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2223
2224   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2225   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2226   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2227   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2228
2229   // stage 2
2230   step2[0] = step1[0];
2231   step2[1] = step1[1];
2232   step2[2] = step1[2];
2233   step2[3] = step1[3];
2234   step2[4] = step1[4];
2235   step2[5] = step1[5];
2236   step2[6] = step1[6];
2237   step2[7] = step1[7];
2238
2239   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2240   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2241   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2242   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2243
2244   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2245   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2246   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2247   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2248
2249   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2250   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2251   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2252   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2253
2254   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2255   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2256   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2257   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2258
2259   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2260   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2261   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2262   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2263   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2264   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2265   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2266   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2267   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2268   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2269   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2270   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2271   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2272   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2273   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2274   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2275
2276   // stage 3
2277   step1[0] = step2[0];
2278   step1[1] = step2[1];
2279   step1[2] = step2[2];
2280   step1[3] = step2[3];
2281
2282   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2283   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2284   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2285   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2286   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2287   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2288   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2290
2291   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2292   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2293   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2294   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2295   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2296   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2297   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2298   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2299
2300   step1[16] = step2[16];
2301   step1[31] = step2[31];
2302   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2303   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2304   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2305   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2306   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2307   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2308   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2309   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2310   step1[19] = step2[19];
2311   step1[20] = step2[20];
2312   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2313   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2314   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2315   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2316   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2317   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2318   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2319   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2320   step1[23] = step2[23];
2321   step1[24] = step2[24];
2322   step1[27] = step2[27];
2323   step1[28] = step2[28];
2324
2325   // stage 4
2326   temp1 = (step1[0] + step1[1]) * cospi_16_64;
2327   temp2 = (step1[0] - step1[1]) * cospi_16_64;
2328   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2329   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2330   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2331   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2332   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2333   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2334   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2335   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2336   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2337   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2338
2339   step2[8] = step1[8];
2340   step2[15] = step1[15];
2341   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2342   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2343   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2344   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2345   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2346   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2347   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2348   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2349   step2[11] = step1[11];
2350   step2[12] = step1[12];
2351
2352   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2353   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2354   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2355   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2356   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2357   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2358   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2359   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2360
2361   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2362   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2363   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2364   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2365   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2366   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2367   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2368   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2369
2370   // stage 5
2371   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2372   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2373   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2374   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2375   step1[4] = step2[4];
2376   temp1 = (step2[6] - step2[5]) * cospi_16_64;
2377   temp2 = (step2[5] + step2[6]) * cospi_16_64;
2378   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2379   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2380   step1[7] = step2[7];
2381
2382   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2383   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2384   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2385   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2386   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2387   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2388   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2389   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2390
2391   step1[16] = step2[16];
2392   step1[17] = step2[17];
2393   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2394   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2395   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2396   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2397   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2398   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2399   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2400   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2401   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2402   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2403   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2404   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2405   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2406   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2407   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2408   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2409   step1[22] = step2[22];
2410   step1[23] = step2[23];
2411   step1[24] = step2[24];
2412   step1[25] = step2[25];
2413   step1[30] = step2[30];
2414   step1[31] = step2[31];
2415
2416   // stage 6
2417   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2418   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2419   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2420   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2421   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2422   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2423   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2424   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2425   step2[8] = step1[8];
2426   step2[9] = step1[9];
2427   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2428   temp2 = (step1[10] + step1[13]) * cospi_16_64;
2429   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2430   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2431   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2432   temp2 = (step1[11] + step1[12]) * cospi_16_64;
2433   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2434   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2435   step2[14] = step1[14];
2436   step2[15] = step1[15];
2437
2438   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2439   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2440   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2441   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2442   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2443   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2444   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2445   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2446
2447   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2448   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2449   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2450   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2451   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2452   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2453   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2454   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2455
2456   // stage 7
2457   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2458   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2459   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2460   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2461   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2462   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2463   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2464   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2465   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2466   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2467   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2468   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2469   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2470   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2471   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2472   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2473
2474   step1[16] = step2[16];
2475   step1[17] = step2[17];
2476   step1[18] = step2[18];
2477   step1[19] = step2[19];
2478   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2479   temp2 = (step2[20] + step2[27]) * cospi_16_64;
2480   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2481   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2482   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2483   temp2 = (step2[21] + step2[26]) * cospi_16_64;
2484   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2485   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2486   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2487   temp2 = (step2[22] + step2[25]) * cospi_16_64;
2488   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2489   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2490   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2491   temp2 = (step2[23] + step2[24]) * cospi_16_64;
2492   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2493   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2494   step1[28] = step2[28];
2495   step1[29] = step2[29];
2496   step1[30] = step2[30];
2497   step1[31] = step2[31];
2498
2499   // final stage
2500   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2501   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2502   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2503   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2504   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2505   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2506   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2507   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2508   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2509   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2510   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2511   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2512   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2513   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2514   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2515   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2516   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2517   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2518   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2519   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2520   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2521   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2522   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2523   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2524   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2525   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2526   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2527   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2528   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2529   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2530   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2531   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2532 }
2533
2534 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2535                                      int stride, int bd) {
2536   int i, j;
2537   tran_low_t out[32 * 32];
2538   tran_low_t *outptr = out;
2539   tran_low_t temp_in[32], temp_out[32];
2540   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2541
2542   // Rows
2543   for (i = 0; i < 32; ++i) {
2544     tran_low_t zero_coeff[16];
2545     for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2546     for (j = 0; j < 8; ++j)
2547       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2548     for (j = 0; j < 4; ++j)
2549       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2550     for (j = 0; j < 2; ++j)
2551       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2552
2553     if (zero_coeff[0] | zero_coeff[1])
2554       highbd_idct32_c(input, outptr, bd);
2555     else
2556       memset(outptr, 0, sizeof(tran_low_t) * 32);
2557     input += 32;
2558     outptr += 32;
2559   }
2560
2561   // Columns
2562   for (i = 0; i < 32; ++i) {
2563     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2564     highbd_idct32_c(temp_in, temp_out, bd);
2565     for (j = 0; j < 32; ++j) {
2566       dest[j * stride + i] = highbd_clip_pixel_add(
2567           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2568     }
2569   }
2570 }
2571
2572 void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,
2573                                     int stride, int bd) {
2574   int i, j;
2575   tran_low_t out[32 * 32] = { 0 };
2576   tran_low_t *outptr = out;
2577   tran_low_t temp_in[32], temp_out[32];
2578   uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
2579
2580   // Rows
2581   // Only upper-left 16x16 has non-zero coeff
2582   for (i = 0; i < 16; ++i) {
2583     highbd_idct32_c(input, outptr, bd);
2584     input += 32;
2585     outptr += 32;
2586   }
2587
2588   // Columns
2589   for (i = 0; i < 32; ++i) {
2590     uint16_t *destT = dest;
2591     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2592     highbd_idct32_c(temp_in, temp_out, bd);
2593     for (j = 0; j < 32; ++j) {
2594       destT[i] = highbd_clip_pixel_add(destT[i],
2595                                        ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2596       destT += stride;
2597     }
2598   }
2599 }
2600
2601 void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2602                                    int stride, int bd) {
2603   int i, j;
2604   tran_low_t out[32 * 32] = { 0 };
2605   tran_low_t *outptr = out;
2606   tran_low_t temp_in[32], temp_out[32];
2607   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2608
2609   // Rows
2610   // Only upper-left 8x8 has non-zero coeff
2611   for (i = 0; i < 8; ++i) {
2612     highbd_idct32_c(input, outptr, bd);
2613     input += 32;
2614     outptr += 32;
2615   }
2616
2617   // Columns
2618   for (i = 0; i < 32; ++i) {
2619     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2620     highbd_idct32_c(temp_in, temp_out, bd);
2621     for (j = 0; j < 32; ++j) {
2622       dest[j * stride + i] = highbd_clip_pixel_add(
2623           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2624     }
2625   }
2626 }
2627
2628 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2629                                   int stride, int bd) {
2630   int i, j;
2631   int a1;
2632   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2633   tran_low_t out =
2634       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2635
2636   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2637   a1 = ROUND_POWER_OF_TWO(out, 6);
2638
2639   for (j = 0; j < 32; ++j) {
2640     for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2641     dest += stride;
2642   }
2643 }
2644
2645 #endif  // CONFIG_VP9_HIGHBITDEPTH